import gradio as gr
import os
import requests
import base64
import pathlib
import threading
import tempfile
import wave

import dashscope
from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat

# ======= Constants Configuration =======
DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
DEFAULT_PREFERRED_NAME = "custom_voice"
DEFAULT_AUDIO_MIME_TYPE = "audio/wav"


def init_dashscope_api_key():
    """Initialize the API key for the dashscope SDK."""
    # Use .get() so a missing variable raises the ValueError below
    # instead of an uncaught KeyError.
    api_key = os.environ.get('DASHSCOPE_API_KEY')
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")
    dashscope.api_key = api_key
    return api_key


def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Create a voice clone and return its voice ID."""
    api_key = init_dashscope_api_key()

    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    # Embed the voice sample as a base64 data URI in the enrollment request
    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"

    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri}
        }
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Create a session and configure retries and SSL
    session = requests.Session()
    # If SSL errors persist, temporarily set this to False (for testing only)
    session.verify = True  # Enable SSL verification

    # Configure the retry strategy
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    try:
        resp = session.post(url, json=payload, headers=headers, timeout=60)
        if resp.status_code != 200:
            raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")
        return resp.json()["output"]["voice"]
    except requests.exceptions.SSLError as e:
        raise RuntimeError(f"SSL connection error: {e}.\nPlease check the network environment or try using a proxy")
    except requests.exceptions.Timeout as e:
        raise RuntimeError(f"Request timeout: {e}")
    except (KeyError, ValueError) as e:
        raise RuntimeError(f"Failed to parse voice response: {e}")
    finally:
        session.close()


class TTSCallback(QwenTtsRealtimeCallback):
    """TTS streaming callback that collects audio data."""

    def __init__(self):
        self.complete_event = threading.Event()
        self.audio_chunks = []
        self.error_msg = None

    def on_open(self) -> None:
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')

    def on_event(self, response: dict) -> None:
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                # Audio arrives as base64-encoded PCM chunks
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
        except Exception as e:
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self):
        """Block until the session finishes or a callback error sets the event."""
        self.complete_event.wait()

    def get_audio_data(self):
        """Return the synthesized audio data."""
        return b''.join(self.audio_chunks)


def synthesize_speech(audio_file, text_input):
    """
    Main entry point for speech synthesis.

    Args:
        audio_file: Path to the recorded audio file (from the Gradio audio component)
        text_input: Text to synthesize

    Returns:
        A (path, status message) tuple; the path is None on failure.
    """
    try:
        if not audio_file:
            return None, "❌ Please record a voice sample first"
        if not text_input or text_input.strip() == "":
            return None, "❌ Please enter the text to synthesize"

        # Initialize the API key
        init_dashscope_api_key()

        # Create the voice clone
        status_msg = "🎤 Creating voice clone..."
        print(status_msg)
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")

        # Initialize TTS
        status_msg = "🔊 Synthesizing speech..."
        print(status_msg)
        callback = TTSCallback()
        qwen_tts_realtime = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        qwen_tts_realtime.connect()

        # Update the session configuration
        qwen_tts_realtime.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )

        # Send the text
        qwen_tts_realtime.append_text(text_input)
        qwen_tts_realtime.finish()

        # Wait for completion
        callback.wait_for_finished()

        if callback.error_msg:
            return None, f"❌ Synthesis failed: {callback.error_msg}"

        # Get the audio data and save it as a WAV file
        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "❌ No audio data generated"

        # Create a temporary file for the output
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            output_path = tmp_file.name

        # Write the WAV file (parameters must match the requested PCM format)
        with wave.open(output_path, 'wb') as wav_file:
            wav_file.setnchannels(1)      # Mono
            wav_file.setsampwidth(2)      # 16-bit
            wav_file.setframerate(24000)  # 24 kHz
            wav_file.writeframes(audio_data)

        success_msg = f"✅ Synthesis successful!\nSession ID: {qwen_tts_realtime.get_session_id()}"
        print(success_msg)
        return output_path, success_msg

    except Exception as e:
        error_msg = f"❌ An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg


# ======= Gradio Interface =======
def create_gradio_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎙️ Qwen Voice Cloning and Synthesis

        **Usage Steps:**
        1. Click the microphone icon to record a voice sample (10-30 seconds recommended, clear and natural)
        2. Enter the text content to synthesize
        3. Click the "Start Synthesis" button
        4. Wait for synthesis to complete, then play or download the result

        **Notes:**
        - Please ensure the environment variable `DASHSCOPE_API_KEY` is set
        - Better recording quality leads to better synthesis results
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav"
                )

                gr.Markdown("### Step 2: Enter Text to Synthesize")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value="Hello, this is a voice synthesized using voice cloning technology."
                )

                submit_btn = gr.Button("🎵 Start Synthesis", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                status_output = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2
                )
                audio_output = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath"
                )

        # Bind events
        submit_btn.click(
            fn=synthesize_speech,
            inputs=[audio_input, text_input],
            outputs=[audio_output, status_output]
        )

        gr.Markdown("""
        ---
        💡 **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
        """)

    return demo


if __name__ == "__main__":
    # Check the API key up front so a missing key fails fast
    try:
        init_dashscope_api_key()
        print("✅ API Key verified successfully")
    except ValueError as e:
        print(f"⚠️ Warning: {e}")
        print("Please set the environment variable: export DASHSCOPE_API_KEY='your-api-key'")

    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False
    )
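
# ----------------------------------------------------------------------------
# Optional: reusing an enrolled voice across syntheses (illustrative sketch).
# create_voice() enrolls a brand-new voice on every call, so repeated
# syntheses from the same sample re-upload the audio each time. A minimal
# in-memory cache, assuming the returned voice ID remains valid for the
# account, could look like the commented sketch below. The helper name
# get_or_create_voice() is hypothetical, not part of the DashScope SDK.
#
#   _voice_cache = {}
#
#   def get_or_create_voice(sample_path: str) -> str:
#       """Enroll sample_path once and return the cached voice ID thereafter."""
#       if sample_path not in _voice_cache:
#           _voice_cache[sample_path] = create_voice(sample_path)
#       return _voice_cache[sample_path]
#
# Persistence across restarts would need a file or database instead of the
# in-memory dict; this is only a sketch of the caching idea.
# ----------------------------------------------------------------------------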