Spaces:

Krishna346
/

Youtube-summarizer-api

Running

File size: 10,264 Bytes

dfbb2da

"""
Speech-to-Text Service using OpenAI Whisper (Local Model)

This service provides LOCAL speech-to-text transcription using Whisper.
NO API CALLS - everything runs on your machine for FREE!

Features:
- Extracts audio from YouTube videos using yt-dlp
- Transcribes audio using Whisper (small model by default)
- Detects the language of the audio automatically
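- Returns both the transcript and a language code (currently always reported as "en"; the language Whisper detects is not read back from the pipeline)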

Requirements:
- FFmpeg must be installed on the system
- Sufficient RAM (~2GB for whisper-small)
"""

import os
import tempfile
import logging
from typing import Optional, Tuple

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import yt_dlp

from config import (
    WHISPER_MODEL,
    AUDIO_FORMAT,
    AUDIO_SAMPLE_RATE,
    normalize_whisper_lang,
)

# Configure logging.
# NOTE: basicConfig() runs at import time and configures the root logger at
# INFO level; per the stdlib it does nothing if the root logger already has
# handlers installed by the host application.
logging.basicConfig(level=logging.INFO)
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)


def get_ffmpeg_path() -> Optional[str]:
    """
    Locate a directory that contains the FFmpeg executable.

    Lookup order:
        1. System PATH - used only when BOTH ``ffmpeg`` and ``ffprobe`` are found.
        2. static-ffmpeg - provides both ffmpeg and ffprobe (may download them
           on first use).
        3. imageio-ffmpeg - provides ffmpeg only, so ffprobe may be missing.

    Every fallback is best-effort: a missing package or a failure inside a
    package is logged (where useful) and the next option is tried.

    Returns:
        The directory containing the ffmpeg executable, or None if no
        FFmpeg installation could be located.
    """
    import shutil

    # 1. System installation: require both tools so yt-dlp can probe and convert.
    system_ffmpeg = shutil.which("ffmpeg")
    system_ffprobe = shutil.which("ffprobe")
    if system_ffmpeg and system_ffprobe:
        logger.info(f"Using system FFmpeg: {system_ffmpeg}")
        return os.path.dirname(system_ffmpeg)

    # 2. static-ffmpeg (provides both ffmpeg and ffprobe)
    try:
        import static_ffmpeg
        # This downloads ffmpeg/ffprobe if not already present
        ffmpeg_path, _ = static_ffmpeg.run.get_or_fetch_platform_executables_else_raise()
        if ffmpeg_path and os.path.exists(ffmpeg_path):
            ffmpeg_dir = os.path.dirname(ffmpeg_path)
            logger.info(f"Using static-ffmpeg: {ffmpeg_dir}")
            return ffmpeg_dir
    except ImportError:
        logger.warning("static-ffmpeg not installed")
    except Exception as e:
        logger.warning(f"static-ffmpeg error: {e}")

    # 3. imageio-ffmpeg (only has ffmpeg, not ffprobe)
    try:
        import imageio_ffmpeg
        ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
        if ffmpeg_path and os.path.exists(ffmpeg_path):
            logger.warning("Using imageio-ffmpeg (may not have ffprobe)")
            return os.path.dirname(ffmpeg_path)
    except ImportError:
        pass
    except Exception as e:
        # get_ffmpeg_exe() raises (e.g. RuntimeError) when no binary is
        # available; treat that as "not found" instead of crashing the caller.
        logger.warning(f"imageio-ffmpeg error: {e}")

    return None


class SpeechToTextService:
    """
    Service for converting speech to text using a local Whisper model.

    The model is lazily loaded on first use to save memory during startup.
    All processing happens locally - no API costs!
    """

    def __init__(self, model_name: str = WHISPER_MODEL):
        """
        Initialize the speech-to-text service.

        No model weights are loaded here; see ``_load_model`` / ``warmup``.

        Args:
            model_name: Hugging Face model identifier for Whisper
        """
        self.model_name = model_name
        self._pipe = None  # Lazy-loaded pipeline

        # Use half precision only on GPU; CPU inference stays in float32.
        cuda_available = torch.cuda.is_available()
        self._device = "cuda" if cuda_available else "cpu"
        self._torch_dtype = torch.float16 if cuda_available else torch.float32

        logger.info(f"SpeechToTextService initialized (device: {self._device})")

    def _load_model(self):
        """
        Load the Whisper model and processor and build the ASR pipeline.

        Called lazily on first transcription request; subsequent calls are
        no-ops once the pipeline exists.

        Raises:
            Exception: If the model, processor or pipeline cannot be created.
                The original error is attached as ``__cause__``.
        """
        if self._pipe is not None:
            return

        logger.info(f"Loading Whisper model: {self.model_name}")
        logger.info("This may take a few minutes on first run (downloading model)...")

        try:
            # Load model with optimizations for CPU/GPU
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                self.model_name,
                torch_dtype=self._torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True
            )
            model.to(self._device)

            # Load processor (tokenizer + feature extractor)
            processor = AutoProcessor.from_pretrained(self.model_name)

            # Create pipeline for easy inference.
            # chunk_length_s=30 enables the pipeline's chunked long-form
            # algorithm: Whisper natively handles 30-second windows, and
            # YouTube audio is normally far longer than that.
            self._pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=self._torch_dtype,
                device=self._device,
                chunk_length_s=30,
                return_timestamps=False
            )

            logger.info("Whisper model loaded successfully!")

        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise Exception(f"Could not load Whisper model: {str(e)}") from e

    def extract_audio_from_youtube(self, url: str) -> str:
        """
        Extract audio from a YouTube video into a fresh temporary directory.

        The caller owns the returned file (and its parent directory) and is
        responsible for deleting them. If extraction fails, the temporary
        directory is removed here before the error is raised.

        Args:
            url: YouTube video URL

        Returns:
            Path to the extracted audio file (in the configured AUDIO_FORMAT)

        Raises:
            Exception: If FFmpeg cannot be located or audio extraction fails
        """
        # Local import, matching get_ffmpeg_path(); only needed for cleanup.
        import shutil

        logger.info(f"Extracting audio from: {url}")

        # Get the directory containing FFmpeg (system, static-ffmpeg or imageio-ffmpeg)
        ffmpeg_path = get_ffmpeg_path()
        if not ffmpeg_path:
            raise Exception("FFmpeg not found. Please install FFmpeg or run: pip install imageio-ffmpeg")

        logger.info(f"Using FFmpeg: {ffmpeg_path}")

        # Create temporary directory for audio file
        temp_dir = tempfile.mkdtemp()
        output_template = os.path.join(temp_dir, "audio.%(ext)s")

        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": output_template,
            "postprocessors": [{
                "key": "FFmpegExtractAudio",
                "preferredcodec": AUDIO_FORMAT,
                "preferredquality": "192",
            }],
            "ffmpeg_location": ffmpeg_path,  # yt-dlp needs the directory containing ffmpeg and ffprobe
            "quiet": True,
            "no_warnings": True,
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # The post-processor writes "audio.<AUDIO_FORMAT>" next to the download
            audio_path = os.path.join(temp_dir, f"audio.{AUDIO_FORMAT}")

            if not os.path.exists(audio_path):
                raise Exception("Audio file was not created")

            logger.info(f"Audio extracted to: {audio_path}")
            return audio_path

        except Exception as e:
            # Nothing usable was produced: do not leak the temp directory
            # (it may contain a partial download).
            shutil.rmtree(temp_dir, ignore_errors=True)
            logger.error(f"Audio extraction failed: {e}")
            raise Exception(f"Could not extract audio: {str(e)}") from e

    def transcribe_audio(self, audio_path: str) -> dict:
        """
        Transcribe an audio file using Whisper.

        NOTE: the language Whisper detects is not currently read back from
        the pipeline; ``raw_language`` is always "en" and ``language`` is
        that value passed through ``normalize_whisper_lang``.

        Args:
            audio_path: Path to the audio file

        Returns:
            Dictionary with:
                - text: The transcribed text
                - language: Language code (normalized)
                - raw_language: Un-normalized language code (currently "en")

        Raises:
            Exception: If the model cannot be loaded, the pipeline fails, or
                the transcription is empty.
        """
        # Ensure model is loaded
        self._load_model()

        logger.info(f"Transcribing audio: {audio_path}")

        try:
            # Run transcription
            result = self._pipe(
                audio_path,
                generate_kwargs={
                    "task": "transcribe",
                    "language": None,  # Auto-detect language
                }
            )

            # Extract text
            text = result.get("text", "").strip()

            if not text:
                raise Exception("Transcription produced empty text")

            # The pipeline output used here carries no language information,
            # so fall back to English.
            raw_language = "en"  # Default to English

            # Normalize the language code
            language = normalize_whisper_lang(raw_language)

            logger.info(f"Transcription complete. Language: {language}")

            return {
                "text": text,
                "language": language,
                "raw_language": raw_language
            }

        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise Exception(f"Could not transcribe audio: {str(e)}") from e

    def transcribe_youtube_video(self, url: str) -> dict:
        """
        Full pipeline: Extract audio from YouTube and transcribe it.

        The temporary audio file and its directory are removed afterwards,
        whether or not transcription succeeded.

        Args:
            url: YouTube video URL

        Returns:
            Dictionary with:
                - text: The transcribed text
                - language: Language code (normalized)
                - raw_language: Un-normalized language code
                - word_count: Number of whitespace-separated words in the transcript

        Raises:
            Exception: If audio extraction or transcription fails
        """
        audio_path = None

        try:
            # Step 1: Extract audio
            audio_path = self.extract_audio_from_youtube(url)

            # Step 2: Transcribe
            result = self.transcribe_audio(audio_path)

            # Add word count
            result["word_count"] = len(result["text"].split())

            return result

        finally:
            # Cleanup: remove the temporary audio file and its parent directory.
            # Best-effort: a cleanup failure must never mask the real result or
            # error, but it is logged instead of being silently swallowed.
            if audio_path and os.path.exists(audio_path):
                try:
                    os.remove(audio_path)
                    temp_dir = os.path.dirname(audio_path)
                    if os.path.exists(temp_dir):
                        os.rmdir(temp_dir)
                except OSError as cleanup_error:
                    logger.warning(f"Could not clean up temporary audio: {cleanup_error}")

    def is_model_loaded(self) -> bool:
        """Check if the Whisper model is currently loaded."""
        return self._pipe is not None

    def warmup(self):
        """
        Pre-load the model to avoid delay on first request.
        Call this during application startup if desired.
        """
        logger.info("Warming up SpeechToTextService...")
        self._load_model()
        logger.info("SpeechToTextService warmup complete!")