alexnasa committed on
Commit 4b1c031 · verified · 1 Parent(s): 23e7175

Upload 24 files

.gitattributes CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/french-ladder.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ assets/french-long.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/french-movie.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ assets/german-5.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ assets/german.mp4 filter=lfs diff=lfs merge=lfs -text
41
+ assets/italian.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ assets/movie.mp4 filter=lfs diff=lfs merge=lfs -text
43
+ assets/noisy-french.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ assets/old-french.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ assets/popup-2.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ assets/port.mp4 filter=lfs diff=lfs merge=lfs -text
47
+ assets/rus.mp4 filter=lfs diff=lfs merge=lfs -text
48
+ assets/spanish-2.mp4 filter=lfs diff=lfs merge=lfs -text
49
+ assets/spanish-3.mp4 filter=lfs diff=lfs merge=lfs -text
50
+ assets/spanish.mp4 filter=lfs diff=lfs merge=lfs -text
51
+ assets/trolls.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,13 @@
1
- ---
2
- title: OutofLipSync
3
- emoji: 📚
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.1.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ---
2
+ title: OutofSync
3
+ emoji: 💋
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.36.2
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Translate and Lipsync any video clips to English
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1113 @@
1
+
2
+ import subprocess
3
+ from huggingface_hub import snapshot_download, hf_hub_download
4
+
5
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
6
+
7
+ snapshot_download(
8
+ repo_id = "alexnasa/outofsync",
9
+ local_dir = "./outofsync"
10
+ )
11
+
12
+ sh("cd outofsync && pip install . && cd ..")
13
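+ # reinstall the GPU build of onnxruntime so the CUDA execution provider is available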
+ sh("pip uninstall onnxruntime onnxruntime-gpu -y && pip install onnxruntime-gpu")
14
+
15
+ import os
16
+ import shutil
17
+
18
+ src = "checkpoints" # your source folder
19
+ dst = "/home/user/.cache/torch/hub/checkpoints"
20
+
21
+ # Create destination folder if it doesn't exist
22
+ os.makedirs(dst, exist_ok=True)
23
+
24
+ # Copy each item from src → dst
25
+ for item in os.listdir(src):
26
+ s = os.path.join(src, item)
27
+ d = os.path.join(dst, item)
28
+
29
+ if os.path.isdir(s):
30
+ # Copy directory
31
+ shutil.copytree(s, d, dirs_exist_ok=True)
32
+ else:
33
+ # Copy file
34
+ shutil.copy2(s, d)
35
+
36
+ print("✓ Done copying checkpoints!")
37
+
38
+ import spaces
39
+ import io
40
+ import torch
41
+ import inspect
42
+ import pyannote.audio.core.task as task_module
43
+ from pathlib import Path
44
+ from pydub import AudioSegment
45
+ import math
46
+
47
+ # Collect all classes from pyannote.audio.core.task
48
+ safe_globals = [torch.torch_version.TorchVersion]
49
+ for name, obj in inspect.getmembers(task_module):
50
+ if inspect.isclass(obj):
51
+ safe_globals.append(obj)
52
+
53
+ # Allow these classes to be used when unpickling weights with weights_only=True
54
+ torch.serialization.add_safe_globals(safe_globals)
55
+
56
+ from typing import List, Dict
57
+ import time
58
+ from time_util import timer
59
+ import os, pathlib, sys, ctypes
60
+ import uuid
61
+ # preload the CNN component
62
+
63
+ ctypes.CDLL("/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_cnn.so.9")
64
+
65
+
66
+ # print(os.environ.get('LD_LIBRARY_PATH', ''))
67
+ import torch, ctranslate2, os
68
+
69
+ import numpy as np
70
+ from pydub import AudioSegment
71
+ from faster_whisper import WhisperModel
72
+ from pyannote.audio import Pipeline
73
+ from pyannote.audio.pipelines.utils.hook import ProgressHook
74
+ import gradio as gr
75
+
76
+ from pydub import AudioSegment
77
+ import srt
78
+ import io
79
+ from pydub import AudioSegment
80
+ import math
81
+ from datetime import timedelta
82
+ import torchaudio
83
+ import tigersound.look2hear.models
84
+
85
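+ # quick check that onnxruntime reports its GPU execution providers inside the @spaces.GPU context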
+ @spaces.GPU()
86
+ def print_ort():
87
+
88
+ import onnxruntime as ort
89
+ print(ort.get_available_providers())
90
+
91
+ print_ort()
92
+
93
+ current_dir = os.path.dirname(os.path.abspath(__file__))
94
+ snapshot_download("IndexTeam/IndexTTS-2", local_dir=os.path.join(current_dir,"checkpoints"))
95
+
96
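+ # TIGER-DnR separates the soundtrack into dialog, effects and music stems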
+ dnr_model = tigersound.look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR").to("cuda").eval()
97
+
98
+ sh("pip install --no-deps git+https://github.com/OutofAi/index-tts.git")
99
+
100
+ from indextts.infer_v2 import IndexTTS2
101
+
102
+ MODE = 'local'
103
+ tts = IndexTTS2(model_dir="./checkpoints",
104
+ cfg_path=os.path.join("./checkpoints", "config.yaml"),
105
+ use_fp16=True,
106
+ use_deepspeed=False,
107
+ use_cuda_kernel=False,
108
+ )
109
+
110
+
111
+ os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/preprocess_results"
112
+
113
+ from lipsync import apply_lipsync
114
+
115
+
116
+ def split_subtitles_max_duration(
117
+ subtitles,
118
+ max_seconds: float = 10.0,
119
+ min_last_chunk_seconds: float = 1.0,
120
+ ):
121
+ """
122
+ Take a list of srt.Subtitle and return a new list where
123
+ no subtitle duration is longer than max_seconds, except that
124
+ the *last* chunk is allowed to exceed max_seconds slightly
125
+ if the leftover duration would otherwise be less than
126
+ min_last_chunk_seconds.
127
+
128
+ Text is split by words roughly evenly across the chunks.
129
+ """
130
+ max_td = timedelta(seconds=max_seconds)
131
+ new_subs = []
132
+ new_index = 1
133
+
134
+ for sub in subtitles:
135
+ start = sub.start
136
+ end = sub.end
137
+ duration = end - start
138
+ total_secs = duration.total_seconds()
139
+
140
+ # If already short enough, just copy it
141
+ if total_secs <= max_seconds:
142
+ new_subs.append(
143
+ srt.Subtitle(
144
+ index=new_index,
145
+ start=start,
146
+ end=end,
147
+ content=sub.content,
148
+ )
149
+ )
150
+ new_index += 1
151
+ continue
152
+
153
+ # Need to split this subtitle
154
+ words = sub.content.split()
155
+ if not words:
156
+ # No text, skip
157
+ continue
158
+
159
+ # --- Determine number of chunks, avoiding tiny last chunk ---
160
+ base_chunks = int(total_secs // max_seconds)
161
+ remainder = total_secs - base_chunks * max_seconds
162
+
163
+ if base_chunks == 0:
164
+ # total_secs > max_seconds due to earlier check, but just in case
165
+ num_chunks = 1
166
+ else:
167
+ if remainder == 0:
168
+ num_chunks = base_chunks
169
+ elif remainder < min_last_chunk_seconds:
170
+ # Don't create a tiny last chunk; merge its time into previous chunks
171
+ num_chunks = base_chunks
172
+ else:
173
+ num_chunks = base_chunks + 1
174
+
175
+ # Ensure at least one chunk
176
+ num_chunks = max(1, num_chunks)
177
+
178
+ # Words per chunk (roughly even)
179
+ words_per_chunk = max(1, int(math.ceil(len(words) / num_chunks)))
180
+
181
+ chunk_start = start
182
+ word_idx = 0
183
+
184
+ for chunk_idx in range(num_chunks):
185
+ # Last chunk takes us all the way to the original end,
186
+ # so it can be slightly > max_seconds if needed.
187
+ if chunk_idx == num_chunks - 1:
188
+ chunk_end = end
189
+ else:
190
+ chunk_end = min(end, chunk_start + max_td)
191
+
192
+ if chunk_end <= chunk_start:
193
+ break
194
+
195
+ chunk_words = words[word_idx:word_idx + words_per_chunk]
196
+ word_idx += words_per_chunk
197
+
198
+ if not chunk_words:
199
+ break
200
+
201
+ new_subs.append(
202
+ srt.Subtitle(
203
+ index=new_index,
204
+ start=chunk_start,
205
+ end=chunk_end,
206
+ content=" ".join(chunk_words),
207
+ )
208
+ )
209
+ new_index += 1
210
+
211
+ chunk_start = chunk_end
212
+
213
+ return new_subs
214
+
215
+
216
+ def split_text_into_chunks(text, max_chars=400):
217
+ """
218
+ Rough splitter: breaks text into chunks <= max_chars,
219
+ preferring to split at sentence boundaries, then spaces.
220
+ """
221
+ text = text.strip()
222
+ chunks = []
223
+
224
+ while len(text) > max_chars:
225
+ # Try to split at the last sentence end before max_chars
226
+ split_at = max(
227
+ text.rfind(". ", 0, max_chars),
228
+ text.rfind("! ", 0, max_chars),
229
+ text.rfind("? ", 0, max_chars),
230
+ )
231
+
232
+ # If there was no sentence boundary, fall back to last space
233
+ if split_at == -1:
234
+ split_at = text.rfind(" ", 0, max_chars)
235
+
236
+ # If still nothing, just hard cut
237
+ if split_at == -1:
238
+ split_at = max_chars
239
+
240
+ chunk = text[:split_at + 1].strip()
241
+ chunks.append(chunk)
242
+ text = text[split_at + 1 :].strip()
243
+
244
+ if text:
245
+ chunks.append(text)
246
+
247
+ return chunks
248
+
249
+
250
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
251
+
252
+ # sh("find / -name \"libcudnn*\" 2>/dev/null")
253
+ # --------------------
254
+ # CONFIG
255
+ # --------------------
256
+ MODEL_SIZE = "medium" # e.g. "small", "medium", "large-v2"
257
+ MIN_SEGMENT_SECONDS = 0.5 # only transcribe segments longer than this
258
+
259
+ # If your pyannote pipeline needs a HF token, set it here or via env var:
260
+ # HUGGINGFACE_TOKEN = "hf_..."
261
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
262
+
263
+ # --------------------
264
+ # LOAD GLOBAL MODELS (ONCE)
265
+ # --------------------
266
+ device = "cuda" if torch.cuda.is_available() else "cpu"
267
+
268
+ print("Loading pyannote diarization model...")
269
+ diarization_pipeline = Pipeline.from_pretrained(
270
+ "pyannote/speaker-diarization-3.1"
271
+ )
272
+
273
+ # --------------------
274
+ # HELPERS
275
+ # --------------------
276
+ def format_timestamp(ts: float) -> str:
277
+ """Convert seconds to SRT timestamp format."""
278
+ hrs = int(ts // 3600)
279
+ mins = int((ts % 3600) // 60)
280
+ secs = int(ts % 60)
281
+ ms = int((ts - int(ts)) * 1000)
282
+ return f"{hrs:02d}:{mins:02d}:{secs:02d},{ms:03d}"
283
+
284
+
285
+ def extract_audio_to_wav(input_video: str, output_dir: str):
286
+
287
+ audio_file = os.path.join(output_dir, "audio_og.wav")
288
+ background_file = os.path.join(output_dir, "background_og.wav")
289
+ vocal_file = os.path.join(output_dir, "vocal_og.wav")
290
+ effect_file = os.path.join(output_dir, "effect_og.wav")
291
+
292
+ audio_16k_file = os.path.join(output_dir, "audio_16k.wav")
293
+
294
+ video_path = input_video
295
+ separator_dir = Path(os.path.join(output_dir, "separator_directory"))
296
+ os.makedirs(separator_dir, exist_ok=True)
297
+
298
+
299
+ # Extract raw audio
300
+ cmd = [
301
+ "ffmpeg",
302
+ "-loglevel", "error",
303
+ "-i", video_path,
304
+ "-vn",
305
+ "-acodec", "pcm_s16le",
306
+ "-ar", "44100",
307
+ "-ac", "2",
308
+ audio_file
309
+ ]
310
+ subprocess.run(cmd, check=True)
311
+
312
+ audio, sr = torchaudio.load(audio_file)
313
+ audio = audio.to("cuda")
314
+
315
+ with torch.no_grad():
316
+ dialog, effect, music = dnr_model(audio[None])
317
+
318
+ torchaudio.save(vocal_file, dialog.cpu(), sr)
319
+ torchaudio.save(effect_file, effect.cpu(), sr)
320
+ torchaudio.save(background_file, music.cpu(), sr)
321
+
322
+ # Convert vocals to 16k mono
323
+ cmd = [
324
+ "ffmpeg",
325
+ "-loglevel", "error",
326
+ "-y",
327
+ "-i", vocal_file,
328
+ "-ac", "1",
329
+ "-ar", "16000",
330
+ "-acodec", "pcm_s16le",
331
+ audio_16k_file
332
+ ]
333
+ subprocess.run(cmd, check=True)
334
+
335
+ return audio_file, effect_file, background_file, audio_16k_file, vocal_file
336
+
337
+
338
+ def diarize_audio(audio_path: str) -> List[Dict]:
339
+ """Run pyannote diarization and return segments."""
340
+
341
+ diarization_pipeline.to(torch.device(device))
342
+
343
+ with ProgressHook() as hook:
344
+ diarization_result = diarization_pipeline(audio_path, hook=hook)
345
+
346
+ segments = []
347
+ for segment, _, speaker in diarization_result.itertracks(yield_label=True):
348
+ duration = segment.end - segment.start
349
+ if duration >= MIN_SEGMENT_SECONDS:
350
+ segments.append(
351
+ {
352
+ "start": float(segment.start),
353
+ "end": float(segment.end),
354
+ "speaker": speaker,
355
+ }
356
+ )
357
+
358
+ segments.sort(key=lambda x: x["start"])
359
+ return segments
360
+
361
+
362
+ def chunk_to_float32(chunk: AudioSegment) -> np.ndarray:
363
+ """Convert a pydub chunk to mono 16kHz float32 numpy array in [-1, 1]."""
364
+ chunk = chunk.set_frame_rate(16000).set_channels(1)
365
+ samples = np.array(chunk.get_array_of_samples())
366
+
367
+ # Normalize based on sample width
368
+ if chunk.sample_width == 2: # 16-bit
369
+ samples = samples.astype(np.float32) / 32768.0
370
+ elif chunk.sample_width == 4: # 32-bit
371
+ samples = samples.astype(np.float32) / 2147483648.0
372
+ else:
373
+ samples = samples.astype(np.float32)
374
+
375
+ return samples
376
+
377
+
378
+ def transcribe_segment(whisper_model, samples: np.ndarray) -> str:
379
+ """Transcribe+translate a single segment with faster-whisper."""
380
+ segment_text_parts = []
381
+
382
+
383
+ segments, info = whisper_model.transcribe(
384
+ samples,
385
+ beam_size=1,
386
+ vad_filter=False, # diarization already detected speech
387
+ condition_on_previous_text=True, # carry context between Whisper's internal segments
388
+ task="translate", # translate to English
389
+ word_timestamps=True,
390
+ )
391
+
392
+ for seg in segments:
393
+ if seg.text:
394
+ segment_text_parts.append(seg.text.strip())
395
+
396
+ return " ".join(segment_text_parts)
397
+
398
+ def transcribe_segment_words(
399
+ whisper_model,
400
+ samples: np.ndarray,
401
+ offset_sec: float,
402
+ speaker: str | None = None,
403
+ ):
404
+ """
405
+ Transcribe+translate a single diarization segment, returning a
406
+ list of word dicts with absolute timestamps.
407
+ """
408
+ words_out = []
409
+
410
+ segments, info = whisper_model.transcribe(
411
+ samples,
412
+ beam_size=1,
413
+ vad_filter=False, # diarization already detected speech
414
+ condition_on_previous_text=False, # better for hard cuts / segments
415
+ task="translate",
416
+ word_timestamps=True,
417
+ )
418
+
419
+ for seg in segments:
420
+ if not seg.words:
421
+ continue
422
+ for w in seg.words:
423
+ words_out.append(
424
+ {
425
+ "start": offset_sec + float(w.start),
426
+ "end": offset_sec + float(w.end),
427
+ "text": w.word,
428
+ "speaker": speaker,
429
+ }
430
+ )
431
+
432
+ return words_out
433
+
434
+ def words_to_subtitles(words, max_seconds: float = 10.0):
435
+ """
436
+ Group word-level timings into SRT subtitles, each up to max_seconds long,
437
+ cutting ONLY at word boundaries, AND never mixing speakers in the same subtitle.
438
+ Whenever the speaker changes, we close the current subtitle and start a new one.
439
+
440
+ Expects each word dict to have:
441
+ - "start" (float, seconds)
442
+ - "end" (float, seconds)
443
+ - "text" (str)
444
+ - "speaker" (str or None)
445
+ """
446
+ # sort just in case
447
+ words = sorted(words, key=lambda w: w["start"])
448
+
449
+ subtitles = []
450
+ current_words = []
451
+ current_start = None
452
+ current_speaker = None
453
+
454
+ index = 1
455
+
456
+ for w in words:
457
+ w_start = w["start"]
458
+ w_end = w["end"]
459
+ w_speaker = w.get("speaker")
460
+
461
+ if current_start is None:
462
+ # start first subtitle
463
+ current_start = w_start
464
+ current_words = [w]
465
+ current_speaker = w_speaker
466
+ continue
467
+
468
+ speaker_changed = (w_speaker != current_speaker)
469
+ duration_if_added = w_end - current_start
470
+ exceeds_max = duration_if_added > max_seconds
471
+
472
+ # If adding this word would:
473
+ # - exceed max_seconds, OR
474
+ # - cross into a different speaker,
475
+ # then we close the current subtitle and start a new one.
476
+ if (speaker_changed or exceeds_max) and current_words:
477
+ text = " ".join(x["text"] for x in current_words).strip()
478
+ sub_start = current_start
479
+ sub_end = current_words[-1]["end"]
480
+
481
+ subtitles.append(
482
+ srt.Subtitle(
483
+ index=index,
484
+ start=timedelta(seconds=sub_start),
485
+ end=timedelta(seconds=sub_end),
486
+ content=text,
487
+ )
488
+ )
489
+ index += 1
490
+
491
+ # start new subtitle from this word
492
+ current_start = w_start
493
+ current_words = [w]
494
+ current_speaker = w_speaker
495
+ else:
496
+ current_words.append(w)
497
+
498
+ # flush last subtitle
499
+ if current_words:
500
+ text = " ".join(x["text"] for x in current_words).strip()
501
+ sub_start = current_start
502
+ sub_end = current_words[-1]["end"]
503
+ subtitles.append(
504
+ srt.Subtitle(
505
+ index=index,
506
+ start=timedelta(seconds=sub_start),
507
+ end=timedelta(seconds=sub_end),
508
+ content=text,
509
+ )
510
+ )
511
+
512
+ return subtitles
513
+
514
+ def build_srt(segments: List[Dict], audio_wav: str, out_srt_path: str):
515
+ """
516
+ Generate SRT file from diarized segments and audio,
517
+ using word-level timestamps and grouping into ~10s subtitles.
518
+ """
519
+ audio = AudioSegment.from_file(audio_wav)
520
+
521
+ print(f"Loading faster-whisper model ({MODEL_SIZE})...")
522
+ whisper_model = WhisperModel(
523
+ MODEL_SIZE,
524
+ device="cuda",
525
+ compute_type="float16",
526
+ )
527
+
528
+ all_words = []
529
+
530
+ for i, seg in enumerate(segments, start=1):
531
+ start_sec = seg["start"]
532
+ end_sec = seg["end"]
533
+ speaker = seg["speaker"]
534
+
535
+ start_ms = int(start_sec * 1000)
536
+ end_ms = int(end_sec * 1000)
537
+ chunk = audio[start_ms:end_ms]
538
+
539
+ samples = chunk_to_float32(chunk)
540
+
541
+ # get words for this diar segment, with absolute times
542
+ seg_words = transcribe_segment_words(
543
+ whisper_model,
544
+ samples,
545
+ offset_sec=start_sec,
546
+ speaker=speaker,
547
+ )
548
+
549
+ all_words.extend(seg_words)
550
+ print(f"Diar segment {i} ({speaker}): {len(seg_words)} words")
551
+
552
+ # group words into ≤10s subtitles, word aligned
553
+ subtitles = words_to_subtitles(all_words, max_seconds=10.0)
554
+
555
+ # write SRT
556
+ with open(out_srt_path, "w", encoding="utf-8") as f:
557
+ f.write(srt.compose(subtitles))
558
+
559
+ def translate_video(video_file):
560
+
561
+ return process_video(video_file, False)
562
+
563
+ def translate_lipsync_video(video_file):
564
+
565
+ return process_video(video_file, True)
566
+
567
+ def run_example(video_file, allow_lipsync, duration):
568
+
569
+ with timer("processed"):
570
+ result = process_video(video_file, allow_lipsync, duration)
571
+
572
+ return result
573
+
574
+ @spaces.GPU(duration=350)
575
+ def process_video(video_file, allow_lipsync, duration = 30):
576
+ """
577
+ Gradio callback:
578
+ - video_file: temp file object/path from Gradio
579
+ - returns path to generated SRT file (for download)
580
+ """
581
+ if video_file is None:
582
+ raise gr.Error("Please upload an MP4 video.")
583
+
584
+ session_id = uuid.uuid4().hex
585
+
586
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
587
+ os.makedirs(output_dir, exist_ok=True)
588
+
589
+ # Gradio's File/Video component gives dict or str depending on version
590
+ if isinstance(video_file, dict):
591
+ video_path = video_file.get("name") or video_file.get("path")
592
+ else:
593
+ video_path = video_file
594
+
595
+ if video_path is None or not os.path.exists(video_path):
596
+ raise gr.Error("Could not read uploaded video file.")
597
+
598
+ # Path for the generated SRT inside the session output directory
599
+ srt_path = os.path.join(output_dir, "diarized_translated.srt")
600
+
601
+ src_video_path = video_path
602
+
603
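+ # trim the upload so only the requested number of seconds is processed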
+ cropped_video_path = os.path.join(output_dir, "input_30s.mp4")
604
+
605
+ duration_s = int(duration)
606
+
607
+ cmd = [
608
+ "ffmpeg",
609
+ "-y",
610
+ "-i", src_video_path,
611
+ "-t", f"{duration_s}",
612
+ "-c", "copy", # stream copy, no re-encode
613
+ cropped_video_path,
614
+ ]
615
+ subprocess.run(cmd, check=True)
616
+ video_path = cropped_video_path
617
+
618
+ # 1. Extract audio
619
+ audio_wav, effect_wav, background_wav, audio_16k_wav, vocal_wav = extract_audio_to_wav(video_path, output_dir)
620
+
621
+ # 2. Diarization
622
+ segments = diarize_audio(audio_16k_wav)
623
+ if not segments:
624
+ raise gr.Error("No valid speech segments found for diarization.")
625
+
626
+ # 3. Build SRT from diarized segments + whisper
627
+ with timer("Generating srt"):
628
+ build_srt(segments, audio_16k_wav, srt_path)
629
+
630
+ # ---- ORIGINAL SRT (used for TTS) ----
631
+ with open(srt_path, "r", encoding="utf-8") as f:
632
+ srt_data = f.read()
633
+
634
+ subtitles = list(srt.parse(srt_data))
635
+
636
+ # Keep this list as-is for TTS timing
637
+ tts_subtitles = subtitles
638
+
639
+ # ---- CREATE 10s-MAX SRT FOR DOWNLOAD ----
640
+ max10_subtitles = tts_subtitles
641
+ # max10_subtitles = split_subtitles_max_duration(subtitles, max_seconds=10.0)
642
+
643
+ tts_subtitles = max10_subtitles
644
+
645
+ srt_10s_path = os.path.join(output_dir, "diarized_translated_max10s.srt")
646
+ with open(srt_10s_path, "w", encoding="utf-8") as f:
647
+ f.write(srt.compose(max10_subtitles))
648
+
649
+ # ---- TTS USING ORIGINAL SRT ----
650
+ last_end_seconds = tts_subtitles[-1].end.total_seconds()
651
+ total_ms = int((last_end_seconds + 1) * 1000)
652
+
653
+ timeline = AudioSegment.silent(duration=total_ms)
654
+
655
+ original_audio = AudioSegment.from_file(audio_wav)
656
+
657
+ MAX_BATCH_MS = 300_000 # ~5 minutes of target subtitle duration per batch
658
+
659
+ with timer("Generating speech"):
660
+ num_subs = len(tts_subtitles)
661
+ idx = 0
662
+
663
+ while idx < num_subs:
664
+ spk_prompts = [] # paths to src_prompt_*.wav
665
+ texts = [] # subtitle texts for this batch
666
+ out_paths = [] # where IndexTTS2 will save generated wavs
667
+ starts_ms = [] # for overlaying later
668
+ target_ms_list = [] # per-subtitle target durations
669
+ batch_ms_sum = 0
670
+
671
+ batch_start = idx
672
+
673
+ # ---- fill one batch until we hit ~MAX_BATCH_MS ----
674
+ while idx < num_subs:
675
+ sub = tts_subtitles[idx]
676
+
677
+ start_ms = int(sub.start.total_seconds() * 1000)
678
+ end_ms = int(sub.end.total_seconds() * 1000)
679
+ target_ms = max(end_ms - start_ms, 0)
680
+
681
+ # If adding this subtitle would exceed the limit and we already
682
+ # have something in the batch, stop and process the current batch.
683
+ if batch_ms_sum + target_ms > MAX_BATCH_MS and len(target_ms_list) > 0:
684
+ break
685
+
686
+ global_idx = idx
687
+
688
+ # 1) prompt audio for this subtitle
689
+ src_chunk = original_audio[start_ms:end_ms]
690
+ src_prompt_path = os.path.join(output_dir, f"src_prompt_{global_idx}.wav")
691
+ src_chunk.export(src_prompt_path, format="wav")
692
+
693
+ # 2) text + output path
694
+ text = sub.content.replace("\n", " ")
695
+ out_path = os.path.join(output_dir, f"gen_{global_idx}.wav")
696
+
697
+ spk_prompts.append(src_prompt_path)
698
+ texts.append(text)
699
+ out_paths.append(out_path)
700
+ starts_ms.append(start_ms)
701
+ target_ms_list.append(target_ms)
702
+
703
+ batch_ms_sum += target_ms
704
+ idx += 1
705
+
706
+ print(f"batch from {batch_start} to {idx - 1}, batch_ms_sum: {batch_ms_sum}")
707
+
708
+ # --- call batched TTS once for this batch ---
709
+ do_sample = True
710
+ top_p = 0.8
711
+ top_k = 30
712
+ temperature = 0.8
713
+ length_penalty = 0.0
714
+ num_beams = 3
715
+ repetition_penalty = 10.0
716
+ max_mel_tokens = 1500
717
+
718
+ # You could compute some aggregate target_length_ms here if your API supports it,
719
+ # e.g. avg or max(target_ms_list). For now, keep None as before.
720
+ tts_outputs = tts.infer_batch(
721
+ spk_audio_prompts=spk_prompts,
722
+ texts=texts,
723
+ output_paths=out_paths,
724
+ emo_audio_prompts=None,
725
+ emo_alpha=1.0,
726
+ emo_vectors=None,
727
+ use_emo_text=False,
728
+ emo_texts=None,
729
+ use_random=False,
730
+ interval_silence=200,
731
+ verbose=False,
732
+ max_text_tokens_per_segment=120,
733
+ speed=1.0,
734
+ target_length_ms=target_ms_list,
735
+ do_sample=do_sample,
736
+ top_p=top_p,
737
+ top_k=top_k,
738
+ temperature=temperature,
739
+ length_penalty=length_penalty,
740
+ num_beams=num_beams,
741
+ repetition_penalty=repetition_penalty,
742
+ max_mel_tokens=max_mel_tokens,
743
+ )
744
+
745
+ # --- read generated wavs and overlay them ---
746
+ for local_idx, out_path in enumerate(tts_outputs):
747
+ start_ms = starts_ms[local_idx]
748
+
749
+ seg = AudioSegment.from_file(out_path, format="wav")
750
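+ # reduce the generated TTS segment by 2 dB before overlaying it onto the timeline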
+ seg = seg - 2
751
+ timeline = timeline.overlay(seg, position=start_ms)
752
+
753
+ # cleanup
754
+ os.remove(out_path)
755
+ os.remove(spk_prompts[local_idx])
756
+
757
+ # -------------------------------------------------------
758
+ # Bring back original dialog in the *gaps* (grunts, etc.)
759
+ # -------------------------------------------------------
760
+ # Load separated dialog track
761
+ dialog = AudioSegment.from_file(vocal_wav)
762
+
763
+ # Make sure it matches the TTS timeline parameters
764
+ dialog = dialog.set_frame_rate(timeline.frame_rate).set_channels(timeline.channels)
765
+
766
+ total_len_ms = len(timeline)
767
+
768
+ # Collect speech regions from subtitles (approximate "where TTS will speak")
769
+ speech_regions = []
770
+ for sub in tts_subtitles:
771
+ start_ms = int(sub.start.total_seconds() * 1000)
772
+ end_ms = int(sub.end.total_seconds() * 1000)
773
+ # clamp to track length
774
+ start_ms = max(0, min(start_ms, total_len_ms))
775
+ end_ms = max(0, min(end_ms, total_len_ms))
776
+ if end_ms > start_ms:
777
+ speech_regions.append((start_ms, end_ms))
778
+
779
+ # Merge overlapping/adjacent regions
780
+ speech_regions.sort()
781
+ merged = []
782
+ for s, e in speech_regions:
783
+ if not merged:
784
+ merged.append([s, e])
785
+ else:
786
+ last_s, last_e = merged[-1]
787
+ if s <= last_e: # overlap or touch
788
+ merged[-1][1] = max(last_e, e)
789
+ else:
790
+ merged.append([s, e])
791
+
792
+ # Compute the complement: regions where there's NO subtitle (gaps)
793
+ gaps = []
794
+ cursor = 0
795
+ for s, e in merged:
796
+ if cursor < s:
797
+ gaps.append((cursor, s))
798
+ cursor = max(cursor, e)
799
+ if cursor < total_len_ms:
800
+ gaps.append((cursor, total_len_ms))
801
+
802
+ # Overlay original dialog only in those gaps
803
+ MIN_GAP_MS = 10 # ignore ultra-tiny gaps
804
+
805
+ for g_start, g_end in gaps:
806
+ if g_end - g_start < MIN_GAP_MS:
807
+ continue
808
+
809
+ # Extract that piece of the original dialog
810
+ original_chunk = dialog[g_start:g_end]
811
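+ # boost the original dialog chunk by 6 dB so it stays audible in the gaps between subtitles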
+ original_chunk = original_chunk + 6
812
+
813
+ timeline = timeline.overlay(original_chunk, position=g_start)
814
+
815
+
816
+ video_in = video_path
817
+ audio_in = output_dir + "/final_output.wav"
818
+ audio_16k_in = output_dir + "/final_16k_output.wav"
819
+
820
+ # ---------- 5. Mix background + new TTS vocal ----------
821
+
822
+ if background_wav is not None:
823
+ eff = AudioSegment.from_file(effect_wav)
824
+ bg = AudioSegment.from_file(background_wav)
825
+
826
+
827
+
828
+ # If the effects or background tracks are shorter than the TTS timeline, loop them
829
+ if len(eff) < len(timeline):
830
+ loops = math.ceil(len(timeline) / len(eff))
831
+ eff = eff * loops
832
+
833
+ if len(bg) < len(timeline):
834
+ loops = math.ceil(len(timeline) / len(bg))
835
+ bg = bg * loops
836
+
837
+
838
+
839
+ # Cut or match to TTS length
840
+ eff = eff[:len(timeline)]
841
+ bg = bg[:len(timeline)]
842
+
843
+
844
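+ # lift the background music and effects by 6 dB before mixing with the TTS vocal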
+ bg = bg + 6
845
+ eff = eff + 6
846
+
847
+ eff_timeline = eff.overlay(timeline)
848
+ final_audio = bg.overlay(eff_timeline)
849
+ final_16k_audio = timeline.set_frame_rate(16000).set_channels(1)
850
+ else:
851
+ # Fallback: no background found, just use TTS
852
+ final_audio = timeline
853
+ final_16k_audio = timeline
854
+
855
+ final_audio.export(audio_in, format="wav")
856
+ final_16k_audio.export(audio_16k_in, format="wav")
857
+
858
+ print(f"Done! Saved to {audio_in}")
859
+
860
+ lipsynced_video = output_dir + "/output_with_lipsync_16k.mp4"
861
+
862
+ if allow_lipsync:
863
+ apply_lipsync(video_in, audio_16k_in, lipsynced_video)
864
+ else:
865
+ lipsynced_video = video_in
866
+
867
+ video_out = output_dir + "/output_with_lipsync.mp4"
868
+
869
+
870
+ cmd = [
871
+ "ffmpeg",
872
+ "-loglevel", "error",
873
+ "-y", # overwrite output file
874
+ "-i", lipsynced_video, # input video
875
+ "-i", audio_in, # new audio
876
+ "-c:v", "copy", # do not re-encode video
877
+ "-map", "0:v:0", # take video from input 0
878
+ "-map", "1:a:0", # take audio from input 1
879
+ "-shortest", # stop when either track ends
880
+ video_out,
881
+ ]
882
+
883
+ subprocess.run(cmd, check=True)
884
+
885
+
886
+ # IMPORTANT: return the 10s-max SRT for download
887
+ return video_out, srt_10s_path, audio_16k_in
888
+
889
+
890
+
891
+ css = """
892
+ #col-container {
893
+ margin: 0 auto;
894
+ max-width: 1600px;
895
+ }
896
+ #modal-container {
897
+ width: 100vw; /* Take full viewport width */
898
+ height: 100vh; /* Take full viewport height (optional) */
899
+ display: flex;
900
+ justify-content: center; /* Center content horizontally */
901
+ align-items: center; /* Center content vertically if desired */
902
+ }
903
+ #modal-content {
904
+ width: 100%;
905
+ max-width: 700px; /* Limit content width */
906
+ margin: 0 auto;
907
+ border-radius: 8px;
908
+ padding: 1.5rem;
909
+ }
910
+ #step-column {
911
+ padding: 10px;
912
+ border-radius: 8px;
913
+ box-shadow: var(--card-shadow);
914
+ margin: 10px;
915
+ }
916
+ #col-showcase {
917
+ margin: 0 auto;
918
+ max-width: 1100px;
919
+ }
920
+ .button-gradient {
921
+ background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
922
+ border: none;
923
+ padding: 14px 28px;
924
+ font-size: 16px;
925
+ font-weight: bold;
926
+ color: white;
927
+ border-radius: 10px;
928
+ cursor: pointer;
929
+ transition: 0.3s ease-in-out;
930
+ animation: 2s linear 0s infinite normal none running gradientAnimation;
931
+ box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
932
+ }
933
+ .toggle-container {
934
+ display: inline-flex;
935
+ background-color: #ffd6ff; /* light pink background */
936
+ border-radius: 9999px;
937
+ padding: 4px;
938
+ position: relative;
939
+ width: fit-content;
940
+ font-family: sans-serif;
941
+ }
942
+ .toggle-container input[type="radio"] {
943
+ display: none;
944
+ }
945
+ .toggle-container label {
946
+ position: relative;
947
+ z-index: 2;
948
+ flex: 1;
949
+ text-align: center;
950
+ font-weight: 700;
951
+ color: #4b2ab5; /* dark purple text for unselected */
952
+ padding: 6px 22px;
953
+ border-radius: 9999px;
954
+ cursor: pointer;
955
+ transition: color 0.25s ease;
956
+ }
957
+ /* Moving highlight */
958
+ .toggle-highlight {
959
+ position: absolute;
960
+ top: 4px;
961
+ left: 4px;
962
+ width: calc(50% - 4px);
963
+ height: calc(100% - 8px);
964
+ background-color: #4b2ab5; /* dark purple background */
965
+ border-radius: 9999px;
966
+ transition: transform 0.25s ease;
967
+ z-index: 1;
968
+ }
969
+ /* When "True" is checked */
970
+ #true:checked ~ label[for="true"] {
971
+ color: #ffd6ff; /* light pink text */
972
+ }
973
+ /* When "False" is checked */
974
+ #false:checked ~ label[for="false"] {
975
+ color: #ffd6ff; /* light pink text */
976
+ }
977
+ /* Move highlight to right side when False is checked */
978
+ #false:checked ~ .toggle-highlight {
979
+ transform: translateX(100%);
980
+ }
981
+ """
982
+
983
+
984
+ with gr.Blocks(css=css) as demo:
985
+
986
+ with gr.Column(elem_id="col-container"):
987
+ gr.HTML(
988
+ """
989
+ <div style="text-align: center;">
990
+ <p style="font-size:16px; display: inline; margin: 0;">
991
+ <strong>OutofSync </strong>
992
+ </p>
993
+ <p style="font-size:16px; display: inline; margin: 0;">
994
+ -- HF Space By:
995
+ </p>
996
+ <a href="https://huggingface.co/alexnasa" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
997
+ <img src="https://img.shields.io/badge/🤗-Follow Me-yellow.svg">
998
+ </a>
999
+ <a href="https://www.buymeacoffee.com/outofai" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;" target="_blank"><img src="https://img.shields.io/badge/-buy_me_a%C2%A0coffee-red?logo=buy-me-a-coffee" alt="Buy Me A Coffee"></a>
1000
+ </div>
1001
+ """
1002
+ )
1003
+
1004
+ with gr.Row():
1005
+ with gr.Column(elem_id="step-column"):
1006
+ gr.HTML("""
1007
+ <div>
1008
+ <span style="font-size: 24px;">1. Upload a Video</span><br>
1009
+ </div>
1010
+ """)
1011
+
1012
+ video_input = gr.Video(
1013
+ label="OG Clip",
1014
+ height=512
1015
+ )
1016
+
1017
+ with gr.Column(elem_id="step-column"):
1018
+ gr.HTML("""
1019
+ <div>
1020
+ <span style="font-size: 24px;">2. Translate + 💋 </span><br>
1021
+ </div>
1022
+ """)
1023
+
1024
+ video_output = gr.Video(label="Output", height=512)
1025
+ lipsync = gr.Checkbox(label="Lipsync", value=False, visible=False)
1026
+ duration = gr.Slider(0, 30, 30, step=10)
1027
+ translate_btn = gr.Button("🤹‍♂️ Translate")
1028
+ translate_lipsync_btn = gr.Button("🤹‍♂️ Translate + 💋 Lipsync", variant='primary', elem_classes="button-gradient")
1029
+
1030
+ with gr.Column(elem_id="step-column"):
1031
+ vocal_16k_output = gr.File(label="Vocal 16k", visible=False)
1032
+ srt_output = gr.File(label="Download translated diarized SRT", visible=False)
1033
+
1034
+ cached_examples = gr.Examples(
1035
+ examples=[
1036
+
1037
+ [
1038
+ "assets/popup-2.mp4",
1039
+ False,
1040
+ 10
1041
+ ],
1042
+
1043
+ [
1044
+ "assets/popup-2.mp4",
1045
+ False,
1046
+ 20
1047
+ ],
1048
+
1049
+ [
1050
+ "assets/popup-2.mp4",
1051
+ False,
1052
+ 30
1053
+ ],
1054
+
1055
+ [
1056
+ "assets/german.mp4",
1057
+ True,
1058
+ 10
1059
+ ],
1060
+
1061
+ [
1062
+ "assets/popup-2.mp4",
1063
+ True,
1064
+ 20
1065
+ ],
1066
+
1067
+ [
1068
+ "assets/popup-2.mp4",
1069
+ True,
1070
+ 30
1071
+ ],
1072
+
1073
+ [
1074
+ "assets/popup-2.mp4",
1075
+ True,
1076
+ 10
1077
+ ],
1078
+
1079
+ [
1080
+ "assets/italian.mp4",
1081
+ True,
1082
+ 10
1083
+ ],
1084
+
1085
+ [
1086
+ "assets/french-movie.mp4",
1087
+ True,
1088
+ 10
1089
+ ],
1090
+
1091
+ ],
1092
+ label="Cached Examples",
1093
+ fn=process_video,
1094
+ inputs=[video_input, lipsync, duration],
1095
+ outputs=[video_output, srt_output, vocal_16k_output],
1096
+ cache_examples=True
1097
+ )
1098
+
1099
+ translate_btn.click(
1100
+ fn=translate_video,
1101
+ inputs=[video_input],
1102
+ outputs=[video_output, srt_output, vocal_16k_output],
1103
+ )
1104
+
1105
+ translate_lipsync_btn.click(
1106
+ fn=translate_lipsync_video,
1107
+ inputs=[video_input],
1108
+ outputs=[video_output, srt_output, vocal_16k_output],
1109
+ )
1110
+
1111
+ if __name__ == "__main__":
1112
+ demo.queue()
1113
+ demo.launch()
assets/french-ladder.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c7242b9627b46650a4189feaba08226eb31110a61fb19773dd8d813e7e8a2f4
3
+ size 673836
assets/french-long.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb9d6be501d45cbd154dfa21e986bcf972aed1f6bf89ee99cceb2fdd03da3366
3
+ size 1591201
assets/french-movie.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0d46088490d5928f632c78ccabd91e1b122268e7009eda4d0b0ffa5dd90f659
3
+ size 610342
assets/german-5.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc3a75afe756c44aad4541c6ef2dc6ac2cfec9507594fe6cc7b005e4c8cea83a
3
+ size 952730
assets/german.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab85fcfcccbd49ab59b2bc05a927c0ab5b6683ea34c0edee45f46b8472c87652
3
+ size 387333
assets/italian.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc9b510c734b833d7502cb3e2f34f0b5c3018de6bb82cb531fd0ec4585a4d8a
3
+ size 458458
assets/movie.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecbb2f43ff7614b1e825ab331fce19907bbb3343cef98e0dacf62ca9c4463ed4
3
+ size 17398047
assets/noisy-french.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1884cd59445f274498698aaab1fc31706b10a7ef64016a779ac7dcef23f9d46
3
+ size 902524
assets/old-french.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7029400cc0be92df96e23c668f72b589b2f2fe3eaa07bd9267d86cc8ccdbf5a
3
+ size 373654
assets/popup-2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b706c1efe8bbbf50521280990011e401c81905005eebdb5ccd2d53392b677621
3
+ size 77033055
assets/port.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99b0b97048d46629baf7e81b5efc0fbb6fdc5cd338e71dc9664a9aade1f5b1be
3
+ size 3674780
assets/rus.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7df70f3d85dc8b1eca364b4665899efd24c8808f43113ef3b1412a2096ce796
3
+ size 2288293
assets/spanish-2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07fba8382d040f191e67fd847e62d31dc8a607b62bc5bdb2611c1f4998dce1a
3
+ size 129943
assets/spanish-3.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d87c8f424c78f531c8cb0ee4a7b6d9d0fc5aa19815bb7f0cac3a645305dcf757
3
+ size 442383
assets/spanish.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6190dc37f372b4d9de0959acbfd89f72c5515169bd70a723b46e875d3a466627
3
+ size 839466
assets/trolls.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e6a0fd5e6202a9e234a51c297d9a6db35a0fe87c813eb2c8599b568004f62b4
3
+ size 9604495
checkpoints/2DFAN4-cd938726ad.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd938726adb1f15f361263cce2db9cb820c42585fa8796ec72ce19107f369a46
3
+ size 96316515
checkpoints/mobilenet0.25_Final.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220
3
+ size 1789735
checkpoints/mobilenet_224_model_best_gdconv_external.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:826b3c902e70e1eeb177f35c73198af0714f74502fe7bd3cdea42e847b1ca30f
3
+ size 15239204
lipsync.py ADDED
@@ -0,0 +1,31 @@
1
+
2
+ from outofsync.latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
3
+ import torch
4
+
5
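+ # construct the lipsync pipeline once at module load and keep it on the GPU so apply_lipsync can reuse it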
+ pipeline = LipsyncPipeline(
6
+ scheduler=None,
7
+ vae=None,
8
+ audio_encoder=None,
9
+ unet=None,
10
+ ).to("cuda")
11
+
12
+ def apply_lipsync(video_input_path, audio_path, video_out_path):
13
+
14
+ torch.manual_seed(1234)
15
+
16
+ print(f"Initial seed: {torch.initial_seed()}")
17
+
18
+ pipeline(
19
+ video_path=video_input_path,
20
+ audio_path=audio_path,
21
+ video_out_path=video_out_path,
22
+ video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
23
+ num_frames=16,
24
+ num_inference_steps=20,
25
+ guidance_scale=1.0,
26
+ weight_dtype=torch.float16,
27
+ width=256,
28
+ height=256,
29
+ )
30
+
31
+ return video_out_path
requirements.txt ADDED
@@ -0,0 +1,74 @@
1
+ # --- Core Torch Stack ---
2
+ torchaudio==2.8.0
3
+ torchvision==0.23.0
4
+ triton
5
+ deepspeed==0.17.1
6
+ flash-attn-3 @ https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
7
+
8
+ # --- Whisper / ASR / Audio Processing ---
9
+ pyannote.audio
10
+ ctranslate2
11
+ faster-whisper[cuda12x]
12
+ pydub==0.25.1
13
+ srt
14
+ audio-separator==0.24.1
15
+ ffmpeg-python==0.2.0
16
+ python_speech_features==0.6
17
+ librosa==0.10.2.post1 # higher than 0.10.1
18
+
19
+ # --- NLP / Transformers / ML Utils ---
20
+ accelerate==1.8.1 # higher than 0.26.1
21
+ transformers==4.52.3 # higher than 4.52.1
22
+ tokenizers==0.21.0
23
+ sentencepiece
24
+ g2p-en==2.1.0
25
+ cn2an==0.5.22
26
+ textstat
27
+ omegaconf==2.3.0 # explicit highest version
28
+ munch==4.0.0
29
+ tqdm
30
+ json5==0.10.0
31
+
32
+ # --- Vision / Diffusion / Media ---
33
+ diffusers==0.33.1
34
+ huggingface-hub<1.0
35
+ imageio==2.27.0
36
+ decord==0.6.0
37
+ opencv-python==4.9.0.80 # same version, deduped
38
+ mediapipe==0.10.11
39
+ av
40
+ torch-fidelity==0.3.0
41
+ torchmetrics==1.3.1
42
+ lpips==0.1.4
43
+ face-alignment==1.4.1
44
+ insightface==0.7.3
45
+ kornia==0.8.0
46
+ scenedetect==0.6.1
47
+ moviepy==1.0.3
48
+
49
+ # --- Numerical / Scientific ---
50
+ numpy==1.26.2 # higher than 1.24.4
51
+ pandas==2.1.3 # higher than 2.0.3
52
+ matplotlib==3.8.2
53
+ numba==0.58.1
54
+ Cython==3.0.7
55
+ einops==0.7.0
56
+ ninja==1.11.1.1
57
+
58
+ # --- Model Repos & Tools ---
59
+ descript-audiotools @ git+https://github.com/descriptinc/audiotools.git
60
+ tigersound @ git+https://github.com/OutofAi/tigersound.git
61
+ tensorboard
62
+ hf-xet==1.1.8
63
+ DeepCache==0.1.1
64
+ ultralytics
65
+ batch-face
66
+ modelscope==1.27.0
67
+ onnxruntime-gpu==1.21.0
68
+
69
+ # --- Language-specific packages ---
70
+ jieba==0.42.1
71
+
72
+ # --- Environment-specific ---
73
+ WeTextProcessing; platform_machine != "Darwin"
74
+ wetext; platform_system == "Darwin"
time_util.py ADDED
@@ -0,0 +1,9 @@
1
+ import time
2
+ from contextlib import contextmanager
3
+
4
+ @contextmanager
5
+ def timer(name: str):
6
+ start = time.time()
7
+ print(f"{name}...")
8
+ yield
9
+ print(f" -> {name} completed in {time.time() - start:.2f} sec")
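A minimal usage sketch of the timer helper above (illustrative only, not part of this commit; it just exercises the context manager defined in time_util.py):

    from time_util import timer
    import time

    with timer("demo step"):    # prints "demo step..."
        time.sleep(0.5)         # then prints " -> demo step completed in 0.50 sec"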