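"""Stream a local audio file to a WebSocket transcription endpoint.

The client reads the file as 16 kHz mono PCM, sends it in fixed-duration
binary chunks, collects the JSON transcription messages the server sends
back, and finally prints the result as SRT-style subtitles.

Example invocation (the script name is illustrative):

    python ws_transcribe_client.py speech.mp3 --language en --vad_filter
"""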
import argparse
import json
import os
import threading
import time
from pathlib import Path
from typing import List

import librosa
import numpy as np
import websocket

# Default WebSocket endpoint for the streaming transcription service
DEFAULT_WS_URL = "ws://localhost:8000/v1/ws_transcribe_streaming"

def parse_arguments():
    parser = argparse.ArgumentParser(description="Stream audio to the transcription WebSocket endpoint.")
    parser.add_argument("audio_file", help="Path to the input audio file.")
    parser.add_argument("--url", default=DEFAULT_WS_URL, help="WebSocket endpoint URL.")
    parser.add_argument("--model", type=str, help="Model name to use for transcription.")
    parser.add_argument("--language", type=str, help="Language code for transcription.")
    parser.add_argument(
        "--response_format",
        type=str,
        default="verbose_json",
        choices=["text", "json", "verbose_json"],
        help="Response format.",
    )
    parser.add_argument("--temperature", type=float, default=0.0, help="Temperature for transcription.")
    parser.add_argument("--vad_filter", action="store_true", help="Enable voice activity detection filter.")
    parser.add_argument("--chunk_duration", type=float, default=1.0, help="Duration of each audio chunk in seconds.")
    return parser.parse_args()

# def preprocess_audio(audio_file, target_sr=16000):
#     """
#     Load the audio file, convert to mono 16kHz, and return the audio data.
#     """
#     if audio_file.endswith(".mp3"):
#         # Convert MP3 to WAV using ffmpeg
#         wav_file = audio_file.replace(".mp3", ".wav")
#         if not os.path.exists(wav_file):
#             command = f'ffmpeg -i "{audio_file}" -ac 1 -ar {target_sr} "{wav_file}"'
#             print(f"Converting MP3 to WAV: {command}")
#             os.system(command)
#         audio_file = wav_file
#
#     print(f"Loading audio file {audio_file}")
#     audio_data, sr = librosa.load(audio_file, sr=target_sr, mono=True)
#     return audio_data, sr
#
# def chunk_audio(audio_data, sr, chunk_duration):
#     """
#     Split the audio data into chunks of specified duration.
#     """
#     chunk_samples = int(chunk_duration * sr)
#     total_samples = len(audio_data)
#     chunks = [
#         audio_data[i:i + chunk_samples]
#         for i in range(0, total_samples, chunk_samples)
#     ]
#     print(f"Split audio into {len(chunks)} chunks of {chunk_duration} seconds each.")
#     return chunks

def read_audio_in_chunks(audio_file, target_sr=16000, chunk_duration=1.0) -> List[np.ndarray]:
    """
    Read a 16 kHz mono audio file in fixed-duration chunks and return them as
    little-endian 16-bit integer arrays.

    Args:
        audio_file (str | Path): Path to the audio file; non-WAV inputs are converted with ffmpeg.
        target_sr (int): Expected sample rate (16000 by default).
        chunk_duration (float): Duration of each chunk in seconds (1 second by default).

    Returns:
        List of numpy arrays: Each array is one chunk of the audio as 16-bit integers.

    Raises:
        ValueError: If the audio file's sample rate doesn't match target_sr.
    """
    if not str(audio_file).endswith(".wav"):
        # Convert the input (e.g. MP3) to 16 kHz mono WAV using ffmpeg
        wav_file = Path(audio_file).with_suffix(".wav")
        if not wav_file.exists():
            command = f'ffmpeg -i "{audio_file}" -ac 1 -ar {target_sr} "{wav_file}"'
            print(f"Converting to WAV: {command}")
            os.system(command)
        audio_file = wav_file

    # Load the audio file at its native sample rate
    audio_data, sr = librosa.load(audio_file, sr=None, mono=True)

    # Raise an exception if the sample rate doesn't match
    if sr != target_sr:
        raise ValueError(f"Unexpected sample rate {sr}. Expected {target_sr}.")

    # Convert the float audio to 16-bit PCM and force little-endian byte order.
    # astype('<i2') is a plain copy on little-endian hosts and byte-swaps on
    # big-endian ones, so no manual byteswap()/newbyteorder() is needed
    # (ndarray.newbyteorder was removed in NumPy 2.0).
    audio_data_little_endian = (audio_data * 32767).astype('<i2')

    # Calculate the number of samples per chunk (chunk_duration may be fractional)
    samples_per_chunk = int(target_sr * chunk_duration)

    # Split the audio into chunks
    chunks = [
        audio_data_little_endian[i:i + samples_per_chunk]
        for i in range(0, len(audio_data_little_endian), samples_per_chunk)
    ]
    return chunks

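# For the defaults (16 kHz, chunk_duration=1.0) each chunk holds 16000 int16
# samples, i.e. 32000 bytes; the final chunk may be shorter.
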
def build_query_params(args):
    """
    Build the query parameters for the WebSocket URL based on command-line arguments.
    """
    params = {}
    if args.model:
        params["model"] = args.model
    if args.language:
        params["language"] = args.language
    if args.response_format:
        params["response_format"] = args.response_format
    if args.temperature is not None:
        params["temperature"] = str(args.temperature)
    if args.vad_filter:
        params["vad_filter"] = "true"
    return params

def websocket_url_with_params(base_url, params):
    """
    Append query parameters to the WebSocket URL.
    """
    from urllib.parse import urlencode

    if params:
        query_string = urlencode(params)
        url = f"{base_url}?{query_string}"
    else:
        url = base_url
    return url

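# e.g. websocket_url_with_params(DEFAULT_WS_URL, {"language": "en", "vad_filter": "true"})
# -> "ws://localhost:8000/v1/ws_transcribe_streaming?language=en&vad_filter=true"
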
def on_message(ws, message):
    """
    Callback function when a message is received from the server.
    """
    try:
        data = json.loads(message)
        # Accumulate transcriptions
        if ws.args.response_format == "verbose_json":
            segments = data.get('segments', [])
            ws.transcriptions.extend(segments)
            for segment in segments:
                print(f"Received segment: {segment['text']}")
        else:
            # For 'json' or 'text' format
            ws.transcriptions.append(data)
            print(f"Transcription: {data['text']}")
    except json.JSONDecodeError:
        print(f"Received non-JSON message: {message}")

def on_error(ws, error):
    """
    Callback function when an error occurs.
    """
    print(f"WebSocket error: {error}")

def on_close(ws, close_status_code, close_msg):
    """
    Callback function when the WebSocket connection is closed.
    """
    print("WebSocket connection closed")

def on_open(ws):
    """
    Callback function when the WebSocket connection is opened.
    """
    print("WebSocket connection opened")
    ws.transcriptions = []  # Initialize the list to store transcriptions

def send_audio_chunks(ws, audio_chunks, sr):
    """
    Send audio chunks to the WebSocket server as binary PCM16 frames.
    """
    for idx, chunk in enumerate(audio_chunks):
        # Chunks are already little-endian int16; send the raw PCM bytes
        audio_bytes = chunk.astype('<i2').tobytes()
        ws.send(audio_bytes, opcode=websocket.ABNF.OPCODE_BINARY)
        print(f"Sent chunk {idx + 1}/{len(audio_chunks)}")
        # Pace the stream at roughly real time: one chunk per chunk-duration
        time.sleep(len(chunk) / sr)
| print("All audio chunks sent") | |
| # Optionally, wait to receive remaining messages | |
| time.sleep(2) | |
| ws.close() | |
| print("Closed WebSocket connection") | |
def format_timestamp(seconds):
    """
    Convert seconds to SRT timestamp format (HH:MM:SS,mmm).
    """
    total_milliseconds = int(seconds * 1000)
    hours = total_milliseconds // (3600 * 1000)
    minutes = (total_milliseconds % (3600 * 1000)) // (60 * 1000)
    secs = (total_milliseconds % (60 * 1000)) // 1000
    milliseconds = total_milliseconds % 1000
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
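# e.g. format_timestamp(3661.5) -> "01:01:01,500"
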
def generate_srt(transcriptions):
    """
    Generate and print SRT content from transcriptions.
    """
    print("\nGenerated SRT:")
    for idx, segment in enumerate(transcriptions, 1):
        start_time = format_timestamp(segment['start'])
        end_time = format_timestamp(segment['end'])
        text = segment['text']
        print(f"{idx}")
        print(f"{start_time} --> {end_time}")
        print(f"{text}\n")
def run_websocket_client(args):
    """
    Run the WebSocket client to stream audio and receive transcriptions.
    """
    ws = None
    ws_thread = None
    try:
        audio_chunks = read_audio_in_chunks(args.audio_file, chunk_duration=args.chunk_duration)
        # params = build_query_params(args)
        # ws_url = websocket_url_with_params(args.url, params)
        ws_url = args.url
        ws = websocket.WebSocketApp(
            ws_url,
            on_open=on_open,
            on_message=on_message,
            on_error=on_error,
            on_close=on_close,
        )
        ws.args = args  # Attach args to ws to access inside callbacks

        # Run the WebSocket in a separate thread to allow sending and receiving simultaneously
        ws_thread = threading.Thread(target=ws.run_forever)
        ws_thread.start()

        # Wait for the connection to open
        while not ws.sock or not ws.sock.connected:
            time.sleep(0.1)

        # Send the audio chunks
        send_audio_chunks(ws, audio_chunks, 16000)
    except Exception as e:
        print(f"An error occurred: {e}")

    # Wait for the WebSocket thread to finish (if it was started)
    if ws_thread is not None:
        ws_thread.join()

    # Generate SRT if transcriptions are available
    if ws is not None and getattr(ws, 'transcriptions', None):
        generate_srt(ws.transcriptions)
    else:
        print("No transcriptions received.")

if __name__ == "__main__":
    args = parse_arguments()
    run_websocket_client(args)