File size: 10,264 Bytes
dfbb2da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
"""
Speech-to-Text Service using OpenAI Whisper (Local Model)

This service provides LOCAL speech-to-text transcription using Whisper.
NO API CALLS - everything runs on your machine for FREE!

Features:
- Extracts audio from YouTube videos using yt-dlp
- Transcribes audio using Whisper (small model by default)
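- Asks Whisper to auto-detect the spoken language while transcribing
- Returns the transcript together with a language code (currently always
  "en": the detected language is not yet read back from the pipeline)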

Requirements:
- FFmpeg and ffprobe must be available (on the system PATH, or via the
  static-ffmpeg package as a fallback)
- Sufficient RAM (~2GB for whisper-small)
"""

import os
import tempfile
import logging
from typing import Optional, Tuple

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import yt_dlp

from config import (
    WHISPER_MODEL,
    AUDIO_FORMAT,
    AUDIO_SAMPLE_RATE,
    normalize_whisper_lang,
)

# Configure logging.
# NOTE: basicConfig() runs at import time; it attaches a default handler to
# the root logger at INFO level only if the root logger has no handlers yet.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_ffmpeg_path() -> Optional[str]:
    """
    Locate a directory that contains an FFmpeg executable.

    Candidates are tried in this order:
      1. The system PATH, accepted only when both ffmpeg and ffprobe are found.
      2. static-ffmpeg, which provides both ffmpeg and ffprobe.
      3. imageio-ffmpeg, which only ships ffmpeg (no ffprobe).

    Returns:
        The directory containing the ffmpeg executable, or None when no
        candidate is usable. Lookup failures are logged, never raised.
    """
    import shutil

    # 1) System installation: require ffprobe as well as ffmpeg.
    ffmpeg_path = shutil.which("ffmpeg")
    ffprobe_path = shutil.which("ffprobe")
    if ffmpeg_path and ffprobe_path:
        logger.info(f"Using system FFmpeg: {ffmpeg_path}")
        return os.path.dirname(ffmpeg_path)

    # 2) static-ffmpeg (provides both ffmpeg and ffprobe).
    try:
        import static_ffmpeg
        # This downloads ffmpeg/ffprobe if not already present
        ffmpeg_path, _ffprobe = static_ffmpeg.run.get_or_fetch_platform_executables_else_raise()
        if ffmpeg_path and os.path.exists(ffmpeg_path):
            ffmpeg_dir = os.path.dirname(ffmpeg_path)
            logger.info(f"Using static-ffmpeg: {ffmpeg_dir}")
            return ffmpeg_dir
    except ImportError:
        logger.warning("static-ffmpeg not installed")
    except Exception as e:
        logger.warning(f"static-ffmpeg error: {e}")

    # 3) imageio-ffmpeg (only has ffmpeg, not ffprobe).
    # NOTE(review): the bundled binary may not be named plain "ffmpeg";
    # confirm that callers can use the returned directory.
    try:
        import imageio_ffmpeg
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
        if ffmpeg_path and os.path.exists(ffmpeg_path):
            logger.warning("Using imageio-ffmpeg (may not have ffprobe)")
            return os.path.dirname(ffmpeg_path)
    except ImportError:
        pass
    except Exception as e:
        # get_ffmpeg_exe() raises (RuntimeError) when it cannot find a binary;
        # treat that like any other unavailable candidate instead of crashing.
        logger.warning(f"imageio-ffmpeg error: {e}")

    return None


class SpeechToTextService:
    """
    Service for converting speech to text using a local Whisper model.

    The model is lazily loaded on first use to save memory during startup.
    All processing happens locally - no API costs!
    """

    # Whisper consumes at most 30 seconds of audio per forward pass. Passing a
    # chunk length to the pipeline makes it split longer recordings into
    # windows and stitch the results, so full-length videos are transcribed.
    _CHUNK_LENGTH_S = 30

    def __init__(self, model_name: str = WHISPER_MODEL):
        """
        Initialize the speech-to-text service.

        No model weights are loaded here; see ``_load_model``.

        Args:
            model_name: Hugging Face model identifier for Whisper
        """
        self.model_name = model_name
        self._pipe = None  # Lazy-loaded pipeline

        # Use the GPU with half precision when available, else CPU/float32.
        cuda_available = torch.cuda.is_available()
        self._device = "cuda" if cuda_available else "cpu"
        self._torch_dtype = torch.float16 if cuda_available else torch.float32

        logger.info(f"SpeechToTextService initialized (device: {self._device})")

    def _load_model(self):
        """
        Load the Whisper model and processor and build the ASR pipeline.

        Called lazily on first transcription request; does nothing when the
        pipeline already exists.

        Raises:
            Exception: If the model, processor or pipeline cannot be created.
        """
        if self._pipe is not None:
            return

        logger.info(f"Loading Whisper model: {self.model_name}")
        logger.info("This may take a few minutes on first run (downloading model)...")

        try:
            # Load model with optimizations for CPU/GPU
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                self.model_name,
                torch_dtype=self._torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True,
            )
            model.to(self._device)

            # Load processor (tokenizer + feature extractor)
            processor = AutoProcessor.from_pretrained(self.model_name)

            # Create pipeline for easy inference
            self._pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=self._torch_dtype,
                device=self._device,
                chunk_length_s=self._CHUNK_LENGTH_S,
                return_timestamps=False,
            )

            logger.info("Whisper model loaded successfully!")

        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise Exception(f"Could not load Whisper model: {str(e)}") from e

    def extract_audio_from_youtube(self, url: str) -> str:
        """
        Download a YouTube video's audio track and convert it with FFmpeg.

        The file is written to a fresh temporary directory. On success the
        caller owns that directory and must delete it; on failure it is
        removed here before the error is raised.

        Args:
            url: YouTube video URL

        Returns:
            Path to the extracted audio file (format given by AUDIO_FORMAT)

        Raises:
            Exception: If FFmpeg cannot be located or audio extraction fails
        """
        import shutil

        logger.info(f"Extracting audio from: {url}")

        # Get FFmpeg directory (system, static-ffmpeg or imageio-ffmpeg)
        ffmpeg_path = get_ffmpeg_path()
        if not ffmpeg_path:
            raise Exception("FFmpeg not found. Please install FFmpeg or run: pip install imageio-ffmpeg")

        logger.info(f"Using FFmpeg: {ffmpeg_path}")

        # Create temporary directory for audio file
        temp_dir = tempfile.mkdtemp()
        output_template = os.path.join(temp_dir, "audio.%(ext)s")

        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": output_template,
            "postprocessors": [{
                "key": "FFmpegExtractAudio",
                "preferredcodec": AUDIO_FORMAT,
                "preferredquality": "192",
            }],
            "ffmpeg_location": ffmpeg_path,  # yt-dlp needs the directory containing ffmpeg and ffprobe
            "quiet": True,
            "no_warnings": True,
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # The post-processor names its output after the requested codec.
            audio_path = os.path.join(temp_dir, f"audio.{AUDIO_FORMAT}")

            if not os.path.exists(audio_path):
                raise Exception("Audio file was not created")

            logger.info(f"Audio extracted to: {audio_path}")
            return audio_path

        except Exception as e:
            # Nothing usable was produced: do not leave the temp dir (and any
            # partial download inside it) behind.
            shutil.rmtree(temp_dir, ignore_errors=True)
            logger.error(f"Audio extraction failed: {e}")
            raise Exception(f"Could not extract audio: {str(e)}") from e

    def transcribe_audio(self, audio_path: str) -> dict:
        """
        Transcribe an audio file using Whisper.

        Args:
            audio_path: Path to the audio file

        Returns:
            Dictionary with:
                - text: The transcribed text (stripped of surrounding whitespace)
                - language: Normalized language code
                - raw_language: Language code before normalization

            NOTE: the language is currently always reported as "en"; the
            language Whisper detects is not read back from the pipeline.

        Raises:
            Exception: If the model cannot be loaded, the pipeline fails, or
                the transcript is empty.
        """
        # Ensure model is loaded
        self._load_model()

        logger.info(f"Transcribing audio: {audio_path}")

        try:
            # Run transcription
            result = self._pipe(
                audio_path,
                generate_kwargs={
                    "task": "transcribe",
                    "language": None,  # Auto-detect language
                }
            )

            # Extract text
            text = result.get("text", "").strip()

            if not text:
                raise Exception("Transcription produced empty text")

            # TODO: surface the language Whisper actually detected. Until
            # then every transcript is labelled as English.
            raw_language = "en"

            # Normalize the language code
            language = normalize_whisper_lang(raw_language)

            logger.info(f"Transcription complete. Language: {language}")

            return {
                "text": text,
                "language": language,
                "raw_language": raw_language,
            }

        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise Exception(f"Could not transcribe audio: {str(e)}") from e

    @staticmethod
    def _remove_temp_audio(audio_path: Optional[str]) -> None:
        """Best-effort removal of an extracted audio file and its (empty) parent directory."""
        if not audio_path or not os.path.exists(audio_path):
            return

        try:
            os.remove(audio_path)
            # rmdir only succeeds on an empty directory, so unrelated files
            # next to the audio file are never deleted.
            temp_dir = os.path.dirname(audio_path)
            if os.path.exists(temp_dir):
                os.rmdir(temp_dir)
        except OSError as e:
            # Cleanup must never mask the transcription result or error.
            logger.warning(f"Could not clean up temporary audio: {e}")

    def transcribe_youtube_video(self, url: str) -> dict:
        """
        Full pipeline: Extract audio from YouTube and transcribe it.

        The temporary audio file is removed afterwards whether or not
        transcription succeeds.

        Args:
            url: YouTube video URL

        Returns:
            Dictionary with:
                - text: The transcribed text
                - language: Normalized language code (see transcribe_audio)
                - raw_language: Language code before normalization
                - word_count: Number of whitespace-separated words in text

        Raises:
            Exception: If audio extraction or transcription fails
        """
        audio_path = None

        try:
            # Step 1: Extract audio
            audio_path = self.extract_audio_from_youtube(url)

            # Step 2: Transcribe
            result = self.transcribe_audio(audio_path)

            # Add word count
            result["word_count"] = len(result["text"].split())

            return result

        finally:
            # Cleanup: Remove temporary audio file and its directory
            self._remove_temp_audio(audio_path)

    def is_model_loaded(self) -> bool:
        """Check if the Whisper model is currently loaded."""
        return self._pipe is not None

    def warmup(self):
        """
        Pre-load the model to avoid delay on first request.
        Call this during application startup if desired.
        """
        logger.info("Warming up SpeechToTextService...")
        self._load_model()
        logger.info("SpeechToTextService warmup complete!")