# Youtube-summarizer-api/services/speech_to_text.py
"""
Speech-to-Text Service using OpenAI Whisper (Local Model)
This service provides LOCAL speech-to-text transcription using Whisper.
NO API CALLS - everything runs on your machine for FREE!
Features:
- Extracts audio from YouTube videos using yt-dlp
- Transcribes audio using Whisper (small model by default)
- Attempts automatic language detection (falls back to English when the
  pipeline does not report a language)
- Returns both the transcript and the detected language
Requirements:
- FFmpeg must be installed on the system
- Sufficient RAM (~2GB for whisper-small)
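
Example (minimal sketch; the URL is a placeholder):
    stt = SpeechToTextService()
    result = stt.transcribe_youtube_video("https://www.youtube.com/watch?v=VIDEO_ID")
    print(result["language"], result["word_count"])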
"""
import os
import shutil
import tempfile
import logging
from typing import Optional, Tuple
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import yt_dlp
from config import (
WHISPER_MODEL,
AUDIO_FORMAT,
AUDIO_SAMPLE_RATE,
normalize_whisper_lang,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_ffmpeg_path() -> Optional[str]:
"""
Get the path to FFmpeg executable directory.
Uses static-ffmpeg which provides both ffmpeg and ffprobe.
Falls back to system PATH or imageio-ffmpeg.
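    Example (sketch):
        ffdir = get_ffmpeg_path()
        # ffdir -> directory containing ffmpeg (and usually ffprobe), or None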
"""
    # Check if ffmpeg AND ffprobe are in system PATH
ffmpeg_path = shutil.which("ffmpeg")
ffprobe_path = shutil.which("ffprobe")
if ffmpeg_path and ffprobe_path:
logger.info(f"Using system FFmpeg: {ffmpeg_path}")
return os.path.dirname(ffmpeg_path)
# Try static-ffmpeg (provides both ffmpeg and ffprobe)
try:
import static_ffmpeg
# This downloads ffmpeg/ffprobe if not already present
ffmpeg_path, ffprobe_path = static_ffmpeg.run.get_or_fetch_platform_executables_else_raise()
if ffmpeg_path and os.path.exists(ffmpeg_path):
ffmpeg_dir = os.path.dirname(ffmpeg_path)
logger.info(f"Using static-ffmpeg: {ffmpeg_dir}")
return ffmpeg_dir
except ImportError:
logger.warning("static-ffmpeg not installed")
except Exception as e:
logger.warning(f"static-ffmpeg error: {e}")
# Fall back to imageio-ffmpeg (only has ffmpeg, not ffprobe)
try:
import imageio_ffmpeg
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
if ffmpeg_path and os.path.exists(ffmpeg_path):
logger.warning("Using imageio-ffmpeg (may not have ffprobe)")
return os.path.dirname(ffmpeg_path)
except ImportError:
pass
return None
class SpeechToTextService:
"""
Service for converting speech to text using local Whisper model.
The model is lazily loaded on first use to save memory during startup.
All processing happens locally - no API costs!
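
    A sketch of the lazy-loading behavior:
        svc = SpeechToTextService()
        svc.is_model_loaded()   # False: nothing loaded yet
        svc.warmup()            # optional: pre-load during startup
        svc.is_model_loaded()   # True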
"""
def __init__(self, model_name: str = WHISPER_MODEL):
"""
Initialize the speech-to-text service.
Args:
model_name: Hugging Face model identifier for Whisper
"""
self.model_name = model_name
self._pipe = None # Lazy-loaded pipeline
        cuda_available = torch.cuda.is_available()
        self._device = "cuda" if cuda_available else "cpu"
        self._torch_dtype = torch.float16 if cuda_available else torch.float32
logger.info(f"SpeechToTextService initialized (device: {self._device})")
def _load_model(self):
"""
Load the Whisper model and processor.
Called lazily on first transcription request.
"""
if self._pipe is not None:
return
logger.info(f"Loading Whisper model: {self.model_name}")
logger.info("This may take a few minutes on first run (downloading model)...")
try:
# Load model with optimizations for CPU/GPU
model = AutoModelForSpeechSeq2Seq.from_pretrained(
self.model_name,
torch_dtype=self._torch_dtype,
low_cpu_mem_usage=True,
use_safetensors=True
)
model.to(self._device)
# Load processor
processor = AutoProcessor.from_pretrained(self.model_name)
# Create pipeline for easy inference
self._pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
torch_dtype=self._torch_dtype,
device=self._device,
return_timestamps=False
)
logger.info("Whisper model loaded successfully!")
except Exception as e:
logger.error(f"Failed to load Whisper model: {e}")
            raise Exception(f"Could not load Whisper model: {e}") from e
def extract_audio_from_youtube(self, url: str) -> str:
"""
Extract audio from a YouTube video.
Args:
url: YouTube video URL
Returns:
Path to the extracted audio file (WAV format)
Raises:
Exception: If audio extraction fails
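
        Example (sketch; assumes AUDIO_FORMAT is "wav" in config):
            path = service.extract_audio_from_youtube("https://youtu.be/VIDEO_ID")
            # path -> ".../audio.wav" inside a fresh temporary directory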
"""
logger.info(f"Extracting audio from: {url}")
# Get FFmpeg path (system or imageio-ffmpeg)
ffmpeg_path = get_ffmpeg_path()
        if not ffmpeg_path:
            raise Exception("FFmpeg not found. Install FFmpeg or run: pip install static-ffmpeg")
logger.info(f"Using FFmpeg: {ffmpeg_path}")
# Create temporary directory for audio file
temp_dir = tempfile.mkdtemp()
output_template = os.path.join(temp_dir, "audio.%(ext)s")
ydl_opts = {
"format": "bestaudio/best",
"outtmpl": output_template,
"postprocessors": [{
"key": "FFmpegExtractAudio",
"preferredcodec": AUDIO_FORMAT,
"preferredquality": "192",
}],
"ffmpeg_location": ffmpeg_path, # yt-dlp needs the directory containing ffmpeg and ffprobe
"quiet": True,
"no_warnings": True,
}
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
# Find the extracted audio file
audio_path = os.path.join(temp_dir, f"audio.{AUDIO_FORMAT}")
if not os.path.exists(audio_path):
raise Exception("Audio file was not created")
logger.info(f"Audio extracted to: {audio_path}")
return audio_path
except Exception as e:
logger.error(f"Audio extraction failed: {e}")
            raise Exception(f"Could not extract audio: {e}") from e
def transcribe_audio(self, audio_path: str) -> dict:
"""
Transcribe an audio file using Whisper.
Args:
audio_path: Path to the audio file
Returns:
Dictionary with:
- text: The transcribed text
- language: Detected language code (normalized)
- raw_language: Original Whisper language code
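
        Example (sketch):
            out = service.transcribe_audio("/tmp/audio.wav")
            # out -> {"text": "...", "language": "en", "raw_language": "en"}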
"""
# Ensure model is loaded
self._load_model()
logger.info(f"Transcribing audio: {audio_path}")
        try:
            # Run transcription. return_language asks the pipeline to report
            # the detected language per chunk (supported for Whisper models
            # in recent transformers releases).
            result = self._pipe(
                audio_path,
                return_language=True,
                generate_kwargs={
                    "task": "transcribe",
                    "language": None,  # Auto-detect language
                }
            )
            # Extract text
            text = result.get("text", "").strip()
            if not text:
                raise Exception("Transcription produced empty text")
            # Read the detected language from the output when present;
            # fall back to English if the pipeline did not report one
            raw_language = "en"
            chunks = result.get("chunks") or []
            if chunks and chunks[0].get("language"):
                raw_language = chunks[0]["language"]
            # Normalize the language code
            language = normalize_whisper_lang(raw_language)
            logger.info(f"Transcription complete. Language: {language}")
return {
"text": text,
"language": language,
"raw_language": raw_language
}
except Exception as e:
logger.error(f"Transcription failed: {e}")
            raise Exception(f"Could not transcribe audio: {e}") from e
def transcribe_youtube_video(self, url: str) -> dict:
"""
Full pipeline: Extract audio from YouTube and transcribe it.
Args:
url: YouTube video URL
Returns:
Dictionary with:
- text: The transcribed text
- language: Detected language code
- word_count: Number of words in transcript
"""
audio_path = None
try:
# Step 1: Extract audio
audio_path = self.extract_audio_from_youtube(url)
# Step 2: Transcribe
result = self.transcribe_audio(audio_path)
# Add word count
result["word_count"] = len(result["text"].split())
return result
        finally:
            # Cleanup: remove the temporary audio file and its directory.
            # shutil.rmtree also removes any leftover intermediate files
            # that yt-dlp may have written next to the audio file, which
            # os.rmdir would trip over.
            if audio_path and os.path.exists(audio_path):
                shutil.rmtree(os.path.dirname(audio_path), ignore_errors=True)
def is_model_loaded(self) -> bool:
"""Check if the Whisper model is currently loaded."""
return self._pipe is not None
def warmup(self):
"""
Pre-load the model to avoid delay on first request.
Call this during application startup if desired.
"""
logger.info("Warming up SpeechToTextService...")
self._load_model()
logger.info("SpeechToTextService warmup complete!")
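

if __name__ == "__main__":
    # Minimal manual test (sketch): run from the project root so that
    # `config` is importable. The URL below is a placeholder.
    service = SpeechToTextService()
    service.warmup()  # optional: avoids the first-request model load delay
    demo = service.transcribe_youtube_video("https://www.youtube.com/watch?v=VIDEO_ID")
    print(f"Language: {demo['language']} | Words: {demo['word_count']}")
    print(demo["text"][:500])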