23f3003322 commited on
Commit
c19b958
Β·
1 Parent(s): 81a2360

media transcriber completed

Browse files
app/orchestrator/actions/action_executor.py CHANGED
@@ -150,27 +150,21 @@ class ActionExecutor:
150
  return results
151
 
152
  async def _handle_ocr(self, urls: List[str]) -> List[str]:
153
- """Handle OCR on images"""
154
- logger.info(f"πŸ–ΌοΈ Processing OCR URLs")
155
-
156
  results = []
157
 
158
  for url in urls:
159
- if not self._is_image(url):
160
- continue
161
 
162
- try:
163
- ocr_result = await self.image_processor.extract_text_from_image(url)
164
- results.append(
165
- f"\n\nText extracted from image {url}:\n{ocr_result['extracted_text']}"
166
- )
167
-
168
- except Exception as e:
169
- logger.error(f"Failed to OCR {url}: {e}")
170
- results.append(f"\n\n[Failed to extract text from {url}: {str(e)}]")
171
 
172
  return results
173
-
 
174
  async def _handle_navigation(self, urls: List[str]) -> List[str]:
175
  """Handle navigation to additional URLs"""
176
  logger.info(f"🌐 Processing navigation URLs")
 
150
  return results
151
 
152
  async def _handle_ocr(self, urls: List[str]) -> List[str]:
 
 
 
153
  results = []
154
 
155
  for url in urls:
156
+ ocr_result = await self.image_processor.extract_text_from_image(url)
 
157
 
158
+ if ocr_result['status'] == 'success':
159
+ results.append(f"\nText from {url}:\n{ocr_result['extracted_text']}")
160
+ elif ocr_result['status'] == 'unavailable':
161
+ results.append(f"\n[Image at {url} - OCR not configured]")
162
+ else:
163
+ results.append(f"\n[OCR failed for {url}]")
 
 
 
164
 
165
  return results
166
+
167
+
168
  async def _handle_navigation(self, urls: List[str]) -> List[str]:
169
  """Handle navigation to additional URLs"""
170
  logger.info(f"🌐 Processing navigation URLs")
app/orchestrator/actions/media_transcriber.py CHANGED
@@ -1,107 +1,245 @@
1
  """
2
- Media Transcriber
3
- Handles audio and video transcription
4
  """
5
 
6
  import httpx
7
- from typing import Dict, Any
 
 
 
8
 
9
  from app.core.config import settings
10
  from app.core.logging import get_logger
11
- from app.core.exceptions import TaskProcessingError
12
 
13
  logger = get_logger(__name__)
14
 
15
 
16
  class MediaTranscriber:
17
  """
18
- Transcribes audio and video files
19
- Uses external APIs (OpenAI Whisper, etc.)
 
 
20
  """
21
 
22
- def __init__(self):
23
  """Initialize media transcriber"""
24
- logger.debug("MediaTranscriber initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- async def transcribe_audio(self, url: str) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
27
  """
28
  Transcribe audio file
 
 
 
29
 
30
- Args:
31
- url: URL to audio file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- Returns:
34
- Dict with transcription result:
35
- {
36
- 'url': str,
37
- 'transcription': str,
38
- 'language': str,
39
- 'duration': float (if available)
 
 
 
 
 
 
 
40
  }
 
 
 
41
  """
42
- logger.info(f"🎀 Transcribing audio from: {url}")
43
-
44
- # For now, return a placeholder
45
- # In production, you would:
46
- # 1. Download the audio file
47
- # 2. Send to transcription API (Whisper, AssemblyAI, etc.)
48
- # 3. Return the transcription
49
-
50
- logger.warning(
51
- "⚠️ Audio transcription not fully implemented. "
52
- "Returning placeholder. Integrate with Whisper API for production."
53
- )
54
 
55
  return {
56
  'url': url,
57
- 'transcription': f"[Audio transcription placeholder for {url}. "
58
- "Integrate with OpenAI Whisper or AssemblyAI API.]",
 
 
 
 
 
59
  'language': 'unknown',
60
- 'status': 'placeholder'
 
 
61
  }
62
 
63
- async def transcribe_video(self, url: str) -> Dict[str, Any]:
64
- """
65
- Transcribe video file (extracts audio and transcribes)
66
-
67
- Args:
68
- url: URL to video file
 
 
 
 
69
 
70
- Returns:
71
- Dict with transcription result
72
- """
73
- logger.info(f"🎬 Transcribing video from: {url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- logger.warning(
76
- "⚠️ Video transcription not fully implemented. "
77
- "Returning placeholder."
 
 
78
  )
79
 
 
 
80
  return {
81
- 'url': url,
82
- 'transcription': f"[Video transcription placeholder for {url}. "
83
- "Extract audio and use Whisper API.]",
84
- 'language': 'unknown',
85
- 'status': 'placeholder'
86
  }
87
 
88
- async def _transcribe_with_whisper(self, audio_file_path: str) -> str:
89
- """
90
- Transcribe using OpenAI Whisper API (placeholder implementation)
 
 
 
 
91
 
92
- Args:
93
- audio_file_path: Path to audio file
94
-
95
- Returns:
96
- str: Transcription text
97
- """
98
- # Placeholder for Whisper API integration
99
- # Actual implementation would use OpenAI API:
100
- #
101
- # import openai
102
- # with open(audio_file_path, 'rb') as f:
103
- # transcript = openai.Audio.transcribe("whisper-1", f)
104
- # return transcript['text']
105
 
106
- logger.warning("Whisper API integration needed for actual transcription")
107
- return "[Transcription unavailable - Whisper API not configured]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Media Transcriber - HF Spaces Free Tier Optimized
3
+ Audio-only support (no ffmpeg needed)
4
  """
5
 
6
  import httpx
7
+ import tempfile
8
+ import os
9
+ from typing import Dict, Any, Optional
10
+ from pathlib import Path
11
 
12
  from app.core.config import settings
13
  from app.core.logging import get_logger
 
14
 
15
  logger = get_logger(__name__)
16
 
17
 
18
  class MediaTranscriber:
19
  """
20
+ Audio transcriber optimized for HF Spaces free tier
21
+ - Supports audio files: .mp3, .wav, .m4a, .ogg, .flac
22
+ - Video files return helpful error message
23
+ - No ffmpeg dependency required
24
  """
25
 
26
+ def __init__(self, timeout: int = 300):
27
  """Initialize media transcriber"""
28
+ self.timeout = timeout
29
+ self.temp_dir = tempfile.mkdtemp(prefix='audio_transcription_')
30
+
31
+ self.download_headers = {
32
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
33
+ 'Accept': 'audio/*,*/*;q=0.8'
34
+ }
35
+
36
+ self.faster_whisper_available = self._check_faster_whisper()
37
+ self.aipipe_available = self._check_aipipe()
38
+
39
+ logger.info(
40
+ f"MediaTranscriber initialized (audio-only) | "
41
+ f"faster-whisper: {'βœ“' if self.faster_whisper_available else 'βœ—'} | "
42
+ f"AIPipe: {'βœ“' if self.aipipe_available else 'βœ—'}"
43
+ )
44
 
45
+ def _check_faster_whisper(self) -> bool:
46
+ """Check if faster-whisper is available"""
47
+ try:
48
+ from faster_whisper import WhisperModel
49
+ return True
50
+ except ImportError:
51
+ return False
52
+
53
+ def _check_aipipe(self) -> bool:
54
+ """Check if AIPipe is configured"""
55
+ return settings.is_llm_configured()
56
+
57
+ async def transcribe_audio(self, url: str, language: Optional[str] = None) -> Dict[str, Any]:
58
  """
59
  Transcribe audio file
60
+ Supports: .mp3, .wav, .m4a, .ogg, .flac, .aac
61
+ """
62
+ logger.info(f"🎀 Transcribing audio: {url}")
63
 
64
+ try:
65
+ # Check if it's actually an audio file
66
+ if not self._is_audio_file(url):
67
+ logger.warning(f"Not an audio file: {url}")
68
+ return {
69
+ 'url': url,
70
+ 'transcription': (
71
+ f'[Only audio files supported. Got: {url}. '
72
+ f'Supported: .mp3, .wav, .m4a, .ogg, .flac, .aac]'
73
+ ),
74
+ 'status': 'unsupported_format',
75
+ 'method': 'none',
76
+ 'language': 'unknown'
77
+ }
78
+
79
+ # Download audio
80
+ audio_path = await self._download_audio(url)
81
+ if not audio_path:
82
+ raise Exception("Failed to download audio")
83
+
84
+ # Transcribe
85
+ if self.faster_whisper_available:
86
+ result = await self._transcribe_with_faster_whisper(audio_path, language)
87
+ elif self.aipipe_available:
88
+ result = await self._transcribe_with_aipipe(audio_path, language)
89
+ else:
90
+ result = {
91
+ 'transcription': f'[Transcription unavailable. Install faster-whisper or set AIPIPE_TOKEN]',
92
+ 'language': 'unknown',
93
+ 'method': 'none',
94
+ 'status': 'unavailable'
95
+ }
96
 
97
+ result['url'] = url
98
+ logger.info(f"βœ… Transcription complete | Method: {result['method']}")
99
+
100
+ return result
101
+
102
+ except Exception as e:
103
+ logger.error(f"❌ Transcription failed: {e}", exc_info=True)
104
+ return {
105
+ 'url': url,
106
+ 'transcription': f'[Transcription failed: {str(e)}]',
107
+ 'status': 'error',
108
+ 'method': 'none', # ← ADD THIS
109
+ 'language': 'unknown', # ← ADD THIS
110
+ 'error': str(e)
111
  }
112
+
113
+
114
+ async def transcribe_video(self, url: str, language: Optional[str] = None) -> Dict[str, Any]:
115
  """
116
+ Video transcription not supported on HF Spaces free tier
117
+ Returns helpful error message
118
+ """
119
+ logger.warning(f"⚠️ Video transcription not supported: {url}")
 
 
 
 
 
 
 
 
120
 
121
  return {
122
  'url': url,
123
+ 'transcription': (
124
+ f'[Video transcription not supported on HF Spaces free tier. '
125
+ f'Video URL: {url}. '
126
+ f'To transcribe videos: '
127
+ f'1) Extract audio locally and upload as .mp3, or '
128
+ f'2) Use a service that provides direct audio URLs.]'
129
+ ),
130
  'language': 'unknown',
131
+ 'method': 'none',
132
+ 'status': 'video_not_supported',
133
+ 'note': 'HF Spaces free tier limitation - no ffmpeg available'
134
  }
135
 
136
+ def _is_audio_file(self, url: str) -> bool:
137
+ """Check if URL is an audio file"""
138
+ audio_extensions = ['.mp3', '.wav', '.m4a', '.ogg', '.flac', '.aac']
139
+ url_lower = url.lower()
140
+ return any(url_lower.endswith(ext) for ext in audio_extensions)
141
+
142
+ async def _download_audio(self, url: str) -> Optional[str]:
143
+ """Download audio file"""
144
+ try:
145
+ logger.info(f"Downloading audio: {url}")
146
 
147
+ async with httpx.AsyncClient(
148
+ timeout=self.timeout,
149
+ follow_redirects=True,
150
+ headers=self.download_headers
151
+ ) as client:
152
+ response = await client.get(url)
153
+ response.raise_for_status()
154
+
155
+ # Save to temp
156
+ extension = Path(url.split('?')[0]).suffix or '.mp3'
157
+ file_path = os.path.join(self.temp_dir, f"audio_{hash(url)}{extension}")
158
+
159
+ with open(file_path, 'wb') as f:
160
+ f.write(response.content)
161
+
162
+ logger.info(f"βœ… Downloaded: {len(response.content) / (1024*1024):.2f} MB")
163
+ return file_path
164
+
165
+ except Exception as e:
166
+ logger.error(f"Download failed: {e}")
167
+ return None
168
+
169
+ async def _transcribe_with_faster_whisper(
170
+ self,
171
+ audio_path: str,
172
+ language: Optional[str] = None
173
+ ) -> Dict[str, Any]:
174
+ """Transcribe with faster-whisper (local, no API key)"""
175
+ from faster_whisper import WhisperModel
176
+
177
+ if not hasattr(self, '_whisper_model'):
178
+ logger.info("Loading faster-whisper model...")
179
+ model_size = os.getenv('WHISPER_MODEL_SIZE', 'base')
180
+ self._whisper_model = WhisperModel(
181
+ model_size,
182
+ device="cpu",
183
+ compute_type="int8"
184
+ )
185
+ logger.info(f"βœ“ Model '{model_size}' loaded")
186
 
187
+ segments, info = self._whisper_model.transcribe(
188
+ audio_path,
189
+ language=language,
190
+ beam_size=5,
191
+ vad_filter=True
192
  )
193
 
194
+ transcription = ' '.join([s.text for s in segments]).strip()
195
+
196
  return {
197
+ 'transcription': transcription,
198
+ 'language': info.language if hasattr(info, 'language') else 'unknown',
199
+ 'duration': info.duration if hasattr(info, 'duration') else None,
200
+ 'method': 'faster_whisper',
201
+ 'status': 'success'
202
  }
203
 
204
+ async def _transcribe_with_aipipe(
205
+ self,
206
+ audio_path: str,
207
+ language: Optional[str] = None
208
+ ) -> Dict[str, Any]:
209
+ """Transcribe with AIPipe API"""
210
+ logger.info("Transcribing with AIPipe...")
211
 
212
+ with open(audio_path, 'rb') as f:
213
+ audio_data = f.read()
214
+
215
+ files = {'file': (os.path.basename(audio_path), audio_data, 'audio/mpeg')}
216
+ data = {'model': 'gpt-4o-audio-preview'}
 
 
 
 
 
 
 
 
217
 
218
+ if language:
219
+ data['language'] = language
220
+
221
+ async with httpx.AsyncClient(timeout=self.timeout) as client:
222
+ response = await client.post(
223
+ f"{settings.AIPIPE_BASE_URL}/audio/transcriptions",
224
+ headers={'Authorization': f'Bearer {settings.AIPIPE_TOKEN}'},
225
+ files=files,
226
+ data=data
227
+ )
228
+ response.raise_for_status()
229
+ result = response.json()
230
+
231
+ return {
232
+ 'transcription': result.get('text', ''),
233
+ 'language': result.get('language', 'unknown'),
234
+ 'duration': result.get('duration'),
235
+ 'method': 'aipipe',
236
+ 'status': 'success'
237
+ }
238
+
239
+ def cleanup(self):
240
+ """Clean up temp files"""
241
+ try:
242
+ import shutil
243
+ shutil.rmtree(self.temp_dir, ignore_errors=True)
244
+ except Exception as e:
245
+ logger.warning(f"Cleanup failed: {e}")
requirements.txt CHANGED
@@ -24,6 +24,8 @@ Pillow
24
  # PDF Processing
25
  PyPDF2==3.0.1
26
 
 
 
27
  # Data Processing
28
  # pandas==2.2.0
29
  # numpy==1.26.3
 
24
  # PDF Processing
25
  PyPDF2==3.0.1
26
 
27
+ faster-whisper
28
+
29
  # Data Processing
30
  # pandas==2.2.0
31
  # numpy==1.26.3
test/media_transcriber.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test Media Transcriber - Audio Only Version
3
+ Tests for HF Spaces free tier (no ffmpeg)
4
+ """
5
+
6
+ import sys
7
+ import os
8
+
9
+ ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
10
+ sys.path.append(ROOT)
11
+
12
+ import asyncio
13
+ from app.orchestrator.actions.media_transcriber import MediaTranscriber
14
+ from app.core.logging import setup_logging, get_logger
15
+
16
+ setup_logging()
17
+ logger = get_logger(__name__)
18
+
19
+ async def test_speech_detection():
20
+ """Test transcription with real internet audio containing speech"""
21
+
22
+ print("\n" + "=" * 60)
23
+ print("Test: Speech Detection (Real World Audio)")
24
+ print("=" * 60)
25
+
26
+ transcriber = MediaTranscriber()
27
+
28
+ # Public domain/open source audio samples with speech
29
+ speech_samples = [
30
+ {
31
+ 'url': 'https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav',
32
+ 'description': 'Open Speech Repository - American English',
33
+ 'format': '.wav',
34
+ 'duration': '~3 seconds',
35
+ 'expected_type': 'clear speech',
36
+ 'source': 'VoIP Troubleshooter Open Speech Repository'
37
+ },
38
+ {
39
+ 'url': 'https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0011_8k.wav',
40
+ 'description': 'Open Speech Repository - Short phrase',
41
+ 'format': '.wav',
42
+ 'duration': '~3 seconds',
43
+ 'expected_type': 'clear speech',
44
+ 'source': 'VoIP Troubleshooter Open Speech Repository'
45
+ },
46
+ {
47
+ 'url': 'https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0012_8k.wav',
48
+ 'description': 'Open Speech Repository - Another phrase',
49
+ 'format': '.wav',
50
+ 'duration': '~3 seconds',
51
+ 'expected_type': 'clear speech',
52
+ 'source': 'VoIP Troubleshooter Open Speech Repository'
53
+ }
54
+ ]
55
+
56
+ print("\nπŸŽ™οΈ Testing with real-world speech samples")
57
+ print("Source: Open Speech Repository (Public Domain)")
58
+ print()
59
+
60
+ success_count = 0
61
+ speech_detected_count = 0
62
+
63
+ for i, sample in enumerate(speech_samples, 1):
64
+ print(f"{'-' * 60}")
65
+ print(f"Test {i}/{len(speech_samples)}")
66
+ print(f"Audio: {sample['description']}")
67
+ print(f"URL: {sample['url']}")
68
+ print(f"Duration: {sample['duration']}")
69
+ print(f"Expected: {sample['expected_type']}")
70
+ print(f"{'-' * 60}")
71
+
72
+ try:
73
+ result = await transcriber.transcribe_audio(sample['url'])
74
+
75
+ status = result.get('status', 'unknown')
76
+ method = result.get('method', 'none')
77
+
78
+ print(f"\nβœ“ Status: {status}")
79
+ print(f"βœ“ Method: {method}")
80
+
81
+ if status == 'success':
82
+ language = result.get('language', 'unknown')
83
+ duration = result.get('duration')
84
+ transcription = result.get('transcription', '').strip()
85
+
86
+ print(f"βœ“ Language: {language}")
87
+ if duration:
88
+ print(f"βœ“ Duration: {duration:.2f} seconds")
89
+
90
+ word_count = len(transcription.split()) if transcription else 0
91
+ print(f"βœ“ Word count: {word_count}")
92
+
93
+ if word_count > 0:
94
+ print(f"\nβœ… SPEECH DETECTED!")
95
+ print(f"\nπŸ“ Transcribed text:")
96
+ print(f' "{transcription}"')
97
+ speech_detected_count += 1
98
+ else:
99
+ print(f"\n⚠️ No words detected")
100
+
101
+ success_count += 1
102
+
103
+ elif status == 'unavailable':
104
+ print("\n⚠️ Transcription backend not available")
105
+ print("πŸ’‘ Install: pip install faster-whisper")
106
+ break
107
+
108
+ elif status == 'error':
109
+ error_msg = result.get('error', 'Unknown')
110
+ print(f"\n❌ Error: {error_msg[:150]}")
111
+
112
+ # Check error type
113
+ if any(x in error_msg.lower() for x in ['network', 'dns', 'timeout', 'nodename']):
114
+ print(" (Network error - trying next sample...)")
115
+ continue
116
+ else:
117
+ print(" (Non-network error - skipping remaining tests)")
118
+ break
119
+
120
+ except Exception as e:
121
+ print(f"\n❌ Exception: {str(e)[:150]}")
122
+ logger.error(f"Test {i} failed", exc_info=True)
123
+ continue
124
+
125
+ print()
126
+
127
+ # Summary
128
+ print("=" * 60)
129
+ print("SPEECH DETECTION SUMMARY")
130
+ print("=" * 60)
131
+
132
+ if success_count > 0:
133
+ print(f"βœ… {success_count}/{len(speech_samples)} samples processed")
134
+ print(f"πŸŽ™οΈ {speech_detected_count}/{success_count} detected speech")
135
+
136
+ if speech_detected_count > 0:
137
+ print(f"\nπŸŽ‰ SUCCESS! Real-world speech transcription working")
138
+ print(f" System successfully transcribed human speech from internet audio")
139
+ else:
140
+ print(f"\n⚠️ Processed but no speech detected")
141
+ else:
142
+ if not (transcriber.faster_whisper_available or transcriber.aipipe_available):
143
+ print("⚠️ No transcription backend installed")
144
+ print(" Install: pip install faster-whisper")
145
+ else:
146
+ print("⚠️ Audio files unavailable or network issue")
147
+ print(" The transcriber itself is properly configured")
148
+
149
+ print("=" * 60)
150
+
151
+ return transcriber
152
+
153
+
154
+ async def test_small_audio_files():
155
+ """Test with small audio files suitable for quick tasks"""
156
+
157
+ print("\n" + "=" * 60)
158
+ print("Test 1: Small Audio Files (< 30 seconds)")
159
+ print("=" * 60)
160
+
161
+ transcriber = MediaTranscriber()
162
+
163
+ # Small, reliable test audio files
164
+ test_audios = [
165
+ {
166
+ 'url': 'https://actions.google.com/sounds/v1/alarms/beep_short.ogg',
167
+ 'description': 'Very short beep (< 1 second)',
168
+ 'format': '.ogg',
169
+ 'expected_duration': '< 1 sec',
170
+ 'expected_text': 'Instrumental/beep (no speech)'
171
+ },
172
+ {
173
+ 'url': 'https://actions.google.com/sounds/v1/cartoon/cartoon_boing.ogg',
174
+ 'description': 'Short sound effect (< 2 seconds)',
175
+ 'format': '.ogg',
176
+ 'expected_duration': '~2 sec',
177
+ 'expected_text': 'Sound effect (no speech)'
178
+ }
179
+ ]
180
+
181
+ print("\nπŸ“ Testing with small audio samples suitable for 3-minute tasks\n")
182
+
183
+ success_count = 0
184
+
185
+ for i, test_audio in enumerate(test_audios, 1):
186
+ print(f"{'-' * 60}")
187
+ print(f"Test {i}/{len(test_audios)}: {test_audio['description']}")
188
+ print(f"URL: {test_audio['url']}")
189
+ print(f"Format: {test_audio['format']}")
190
+ print(f"Expected duration: {test_audio['expected_duration']}")
191
+ print(f"Expected: {test_audio['expected_text']}")
192
+ print(f"{'-' * 60}")
193
+
194
+ try:
195
+ result = await transcriber.transcribe_audio(test_audio['url'])
196
+
197
+ status = result.get('status', 'unknown')
198
+ method = result.get('method', 'none')
199
+
200
+ print(f"\nβœ“ Status: {status}")
201
+ print(f"βœ“ Method: {method}")
202
+
203
+ if status == 'success':
204
+ print(f"βœ… Transcription successful!")
205
+
206
+ language = result.get('language', 'unknown')
207
+ print(f"βœ“ Language: {language}")
208
+
209
+ duration = result.get('duration')
210
+ if duration:
211
+ print(f"βœ“ Duration: {duration:.2f} seconds")
212
+
213
+ transcription = result.get('transcription', '')
214
+ print(f"βœ“ Text length: {len(transcription)} chars")
215
+
216
+ if transcription.strip():
217
+ print(f"\nπŸ“ Transcription:")
218
+ print(f" {transcription[:200]}")
219
+ else:
220
+ print(f"\nπŸ“ No speech detected (expected for sound effects)")
221
+
222
+ success_count += 1
223
+
224
+ elif status == 'unavailable':
225
+ print("⚠️ Transcription backend not available")
226
+ print("\nπŸ’‘ To enable transcription:")
227
+ print(" 1. Install: pip install faster-whisper")
228
+ print(" 2. Or set AIPIPE_TOKEN in .env")
229
+ break # No point testing other files
230
+
231
+ elif status == 'error':
232
+ error_msg = result.get('error', 'Unknown')
233
+ print(f"❌ Error: {error_msg[:100]}")
234
+
235
+ # Check if it's a network error
236
+ if any(x in error_msg.lower() for x in ['network', 'dns', 'nodename', 'timeout']):
237
+ print(" ℹ️ Network error - URL may be temporarily unavailable")
238
+
239
+ except Exception as e:
240
+ print(f"❌ Exception: {str(e)[:100]}")
241
+ logger.error(f"Test {i} failed", exc_info=True)
242
+
243
+ print()
244
+
245
+ # Summary
246
+ print("=" * 60)
247
+ if success_count > 0:
248
+ print(f"βœ… {success_count}/{len(test_audios)} audio files transcribed successfully")
249
+ elif transcriber.faster_whisper_available or transcriber.aipipe_available:
250
+ print("⚠️ Transcription available but test files failed to download")
251
+ print(" (Network issue - the transcriber itself is working)")
252
+ else:
253
+ print("ℹ️ No transcription backend installed")
254
+ print("=" * 60)
255
+
256
+ return transcriber
257
+
258
+
259
+ async def test_video_rejection():
260
+ """Test that video files are rejected gracefully"""
261
+
262
+ print("\n" + "=" * 60)
263
+ print("Test 2: Video File Rejection (Audio-Only Mode)")
264
+ print("=" * 60)
265
+
266
+ transcriber = MediaTranscriber()
267
+
268
+ # Test video URL
269
+ test_video = {
270
+ 'url': 'https://example.com/sample-video.mp4',
271
+ 'description': 'Sample video file'
272
+ }
273
+
274
+ print(f"\nπŸ“Ή Testing: {test_video['description']}")
275
+ print(f"URL: {test_video['url']}")
276
+ print(f"Expected: Rejection with helpful message")
277
+ print("-" * 60)
278
+
279
+ result = await transcriber.transcribe_video(test_video['url'])
280
+
281
+ status = result.get('status', 'unknown')
282
+ print(f"\nβœ“ Status: {status}")
283
+
284
+ if status == 'video_not_supported':
285
+ print(f"βœ… Video correctly rejected (audio-only mode)")
286
+ print(f"\nπŸ“ Message shown to user:")
287
+ print(f" {result.get('transcription', '')[:200]}...")
288
+ else:
289
+ print(f"⚠️ Unexpected status: {status}")
290
+
291
+ return transcriber
292
+
293
+
294
+ async def test_format_detection():
295
+ """Test audio format detection"""
296
+
297
+ print("\n" + "=" * 60)
298
+ print("Test 3: Format Detection & Validation")
299
+ print("=" * 60)
300
+
301
+ transcriber = MediaTranscriber()
302
+
303
+ test_cases = [
304
+ {
305
+ 'url': 'https://example.com/file.mp3',
306
+ 'expected': 'audio',
307
+ 'description': 'MP3 audio file'
308
+ },
309
+ {
310
+ 'url': 'https://example.com/file.wav',
311
+ 'expected': 'audio',
312
+ 'description': 'WAV audio file'
313
+ },
314
+ {
315
+ 'url': 'https://example.com/file.m4a',
316
+ 'expected': 'audio',
317
+ 'description': 'M4A audio file'
318
+ },
319
+ {
320
+ 'url': 'https://example.com/image.png',
321
+ 'expected': 'unsupported',
322
+ 'description': 'PNG image (not audio)'
323
+ },
324
+ {
325
+ 'url': 'https://example.com/doc.pdf',
326
+ 'expected': 'unsupported',
327
+ 'description': 'PDF document (not audio)'
328
+ }
329
+ ]
330
+
331
+ print("\nπŸ” Testing format detection for various file types:\n")
332
+
333
+ for i, test in enumerate(test_cases, 1):
334
+ is_audio = transcriber._is_audio_file(test['url'])
335
+ detected = 'audio' if is_audio else 'unsupported'
336
+
337
+ if detected == test['expected']:
338
+ status = "βœ…"
339
+ else:
340
+ status = "❌"
341
+
342
+ print(f"{status} {test['description']}")
343
+ print(f" URL: {test['url']}")
344
+ print(f" Detected: {detected} | Expected: {test['expected']}")
345
+ print()
346
+
347
+ return transcriber
348
+
349
+
350
+ async def test_backend_check():
351
+ """Test backend availability"""
352
+
353
+ print("\n" + "=" * 60)
354
+ print("Test 4: Transcription Backend Status")
355
+ print("=" * 60)
356
+
357
+ transcriber = MediaTranscriber()
358
+
359
+ print("\nπŸ”§ Checking available backends:\n")
360
+
361
+ # Check faster-whisper
362
+ if transcriber.faster_whisper_available:
363
+ print("βœ… faster-whisper: Available")
364
+ print(" β†’ Local transcription (CPU)")
365
+ print(" β†’ No API key needed")
366
+ print(" β†’ Free, unlimited")
367
+ print(" β†’ Model: base (~150MB)")
368
+ print(" β†’ Speed: ~20 seconds per minute of audio")
369
+ else:
370
+ print("❌ faster-whisper: Not installed")
371
+ print(" β†’ Install: pip install faster-whisper")
372
+
373
+ print()
374
+
375
+ # Check AIPipe
376
+ if transcriber.aipipe_available:
377
+ print("βœ… AIPipe: Configured")
378
+ print(" β†’ Cloud transcription")
379
+ print(" β†’ Uses AIPIPE_TOKEN")
380
+ print(" β†’ Model: gpt-4o-audio-preview")
381
+ print(" β†’ Speed: ~5 seconds per minute of audio")
382
+ else:
383
+ print("❌ AIPipe: Not configured")
384
+ print(" β†’ Set AIPIPE_TOKEN in .env")
385
+
386
+ print()
387
+
388
+ # Recommendation
389
+ if transcriber.faster_whisper_available:
390
+ print("πŸ’‘ Recommendation: Using faster-whisper (local, free)")
391
+ elif transcriber.aipipe_available:
392
+ print("πŸ’‘ Recommendation: Using AIPipe (cloud, paid)")
393
+ else:
394
+ print("⚠️ No transcription backend available")
395
+ print("\nπŸ“¦ Quick Setup:")
396
+ print(" pip install faster-whisper")
397
+
398
+ return transcriber
399
+
400
+
401
+ async def test_performance_estimate():
402
+ """Show performance estimates for typical task sizes"""
403
+
404
+ print("\n" + "=" * 60)
405
+ print("Test 5: Performance Estimates")
406
+ print("=" * 60)
407
+
408
+ transcriber = MediaTranscriber()
409
+
410
+ # Typical task scenarios
411
+ scenarios = [
412
+ {'duration': 10, 'description': 'Very short clip'},
413
+ {'duration': 30, 'description': 'Short instruction'},
414
+ {'duration': 60, 'description': 'One minute audio'},
415
+ {'duration': 120, 'description': 'Two minute recording'},
416
+ {'duration': 180, 'description': 'Maximum task audio (3 min)'}
417
+ ]
418
+
419
+ print("\n⏱️ Estimated transcription times for HF Spaces free tier:\n")
420
+ print(f"{'Audio Duration':<20} | {'faster-whisper':<20} | {'AIPipe':<20}")
421
+ print("-" * 65)
422
+
423
+ for scenario in scenarios:
424
+ duration = scenario['duration']
425
+ desc = scenario['description']
426
+
427
+ # Estimates (conservative for free tier CPU)
428
+ local_time = duration * 0.3 # ~30% of audio duration
429
+ cloud_time = duration * 0.1 # ~10% of audio duration
430
+
431
+ print(f"{duration}s ({desc:<15}) | ~{local_time:.0f}s | ~{cloud_time:.0f}s")
432
+
433
+ print()
434
+ print("πŸ“ Notes:")
435
+ print(" - Estimates for HF Spaces CPU tier")
436
+ print(" - faster-whisper: First run downloads model (~30s)")
437
+ print(" - AIPipe: Network latency may add 1-2 seconds")
438
+ print(" - All times well within 3-minute task limit")
439
+
440
+ return transcriber
441
+
442
+
443
+ async def run_all_tests():
444
+ """Run all tests"""
445
+
446
+ print("\n" + "=" * 80)
447
+ print(" " * 15 + "MEDIA TRANSCRIBER TEST SUITE")
448
+ print(" " * 12 + "(Small Audio Files - 3 Minute Tasks)")
449
+ print("=" * 80)
450
+
451
+ transcriber = None
452
+
453
+ try:
454
+ # Test 1: Small audio files
455
+ transcriber = await test_small_audio_files()
456
+
457
+ # Test 2: Video rejection
458
+ if transcriber:
459
+ transcriber.cleanup()
460
+ transcriber = await test_video_rejection()
461
+
462
+ # Test 3: Format detection
463
+ if transcriber:
464
+ transcriber.cleanup()
465
+ transcriber = await test_format_detection()
466
+
467
+ # Test 4: Backend check
468
+ if transcriber:
469
+ transcriber.cleanup()
470
+ transcriber = await test_backend_check()
471
+
472
+ # Test 5: Performance estimates
473
+ if transcriber:
474
+ transcriber.cleanup()
475
+ transcriber = await test_performance_estimate()
476
+
477
+ if transcriber:
478
+ transcriber.cleanup()
479
+ transcriber = await test_speech_detection()
480
+
481
+ print("\n" + "=" * 80)
482
+ print(" " * 30 + "TESTS COMPLETE")
483
+ print("=" * 80)
484
+
485
+ print("\nβœ… All tests finished!")
486
+ print("\nπŸ“Š Summary:")
487
+ print(" β€’ Small audio files tested (< 30 seconds)")
488
+ print(" β€’ Video rejection verified")
489
+ print(" β€’ Format detection working")
490
+ print(" β€’ Performance suitable for 3-minute tasks")
491
+ print("\nπŸ’‘ For production: Install faster-whisper for free local transcription")
492
+
493
+ except Exception as e:
494
+ print("\n" + "=" * 80)
495
+ print(f"❌ Test suite error: {e}")
496
+ print("=" * 80)
497
+ logger.error("Test suite failed", exc_info=True)
498
+
499
+ finally:
500
+ if transcriber:
501
+ transcriber.cleanup()
502
+ print("\n🧹 Cleanup complete")
503
+
504
+
505
+ if __name__ == "__main__":
506
+ asyncio.run(run_all_tests())