alexnasa committed on
Commit 4b1c031 · verified · 1 Parent(s): 23e7175

Upload 24 files

.gitattributes CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/french-ladder.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ assets/french-long.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/french-movie.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ assets/german-5.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ assets/german.mp4 filter=lfs diff=lfs merge=lfs -text
41
+ assets/italian.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ assets/movie.mp4 filter=lfs diff=lfs merge=lfs -text
43
+ assets/noisy-french.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ assets/old-french.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ assets/popup-2.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ assets/port.mp4 filter=lfs diff=lfs merge=lfs -text
47
+ assets/rus.mp4 filter=lfs diff=lfs merge=lfs -text
48
+ assets/spanish-2.mp4 filter=lfs diff=lfs merge=lfs -text
49
+ assets/spanish-3.mp4 filter=lfs diff=lfs merge=lfs -text
50
+ assets/spanish.mp4 filter=lfs diff=lfs merge=lfs -text
51
+ assets/trolls.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,13 @@
1
- ---
2
- title: OutofLipSync
3
- emoji: 📚
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.1.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ---
2
+ title: OutofSync
3
+ emoji: 💋
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.36.2
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Translate and Lipsync any video clips to English
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1113 @@
1
+
2
+ import subprocess
3
+ from huggingface_hub import snapshot_download, hf_hub_download
4
+
5
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
6
+
7
+ snapshot_download(
8
+ repo_id = "alexnasa/outofsync",
9
+ local_dir = "./outofsync"
10
+ )
11
+
12
+ sh("cd outofsync && pip install . && cd ..")
13
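+ # reinstall the GPU build of onnxruntime so the CUDA execution provider is available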
+ sh("pip uninstall onnxruntime onnxruntime-gpu -y && pip install onnxruntime-gpu")
14
+
15
+ import os
16
+ import shutil
17
+
18
+ src = "checkpoints" # your source folder
19
+ dst = "/home/user/.cache/torch/hub/checkpoints"
20
+
21
+ # Create destination folder if it doesn't exist
22
+ os.makedirs(dst, exist_ok=True)
23
+
24
+ # Copy each item from src → dst
25
+ for item in os.listdir(src):
26
+ s = os.path.join(src, item)
27
+ d = os.path.join(dst, item)
28
+
29
+ if os.path.isdir(s):
30
+ # Copy directory
31
+ shutil.copytree(s, d, dirs_exist_ok=True)
32
+ else:
33
+ # Copy file
34
+ shutil.copy2(s, d)
35
+
36
+ print("✓ Done copying checkpoints!")
37
+
38
+ import spaces
39
+ import io
40
+ import torch
41
+ import inspect
42
+ import pyannote.audio.core.task as task_module
43
+ from pathlib import Path
44
+ from pydub import AudioSegment
45
+ import math
46
+
47
+ # Collect all classes from pyannote.audio.core.task
48
+ safe_globals = [torch.torch_version.TorchVersion]
49
+ for name, obj in inspect.getmembers(task_module):
50
+ if inspect.isclass(obj):
51
+ safe_globals.append(obj)
52
+
53
+ # Allow these classes to be used when unpickling weights with weights_only=True
54
+ torch.serialization.add_safe_globals(safe_globals)
55
+
56
+ from typing import List, Dict
57
+ import time
58
+ from time_util import timer
59
+ import os, pathlib, sys, ctypes
60
+ import uuid
61
+ # preload the CNN component
62
+
63
+ ctypes.CDLL("/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_cnn.so.9")
64
+
65
+
66
+ # print(os.environ.get('LD_LIBRARY_PATH', ''))
67
+ import torch, ctranslate2, os
68
+
69
+ import numpy as np
70
+ from pydub import AudioSegment
71
+ from faster_whisper import WhisperModel
72
+ from pyannote.audio import Pipeline
73
+ from pyannote.audio.pipelines.utils.hook import ProgressHook
74
+ import gradio as gr
75
+
76
+ from pydub import AudioSegment
77
+ import srt
78
+ import io
79
+ from pydub import AudioSegment
80
+ import math
81
+ from datetime import timedelta
82
+ import torchaudio
83
+ import tigersound.look2hear.models
84
+
85
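+ # quick check that onnxruntime reports its GPU execution providers inside the @spaces.GPU context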
+ @spaces.GPU()
86
+ def print_ort():
87
+
88
+ import onnxruntime as ort
89
+ print(ort.get_available_providers())
90
+
91
+ print_ort()
92
+
93
+ current_dir = os.path.dirname(os.path.abspath(__file__))
94
+ snapshot_download("IndexTeam/IndexTTS-2", local_dir=os.path.join(current_dir,"checkpoints"))
95
+
96
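+ # TIGER-DnR separates the soundtrack into dialog, effects and music stems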
+ dnr_model = tigersound.look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR").to("cuda").eval()
97
+
98
+ sh("pip install --no-deps git+https://github.com/OutofAi/index-tts.git")
99
+
100
+ from indextts.infer_v2 import IndexTTS2
101
+
102
+ MODE = 'local'
103
+ tts = IndexTTS2(model_dir="./checkpoints",
104
+ cfg_path=os.path.join("./checkpoints", "config.yaml"),
105
+ use_fp16=True,
106
+ use_deepspeed=False,
107
+ use_cuda_kernel=False,
108
+ )
109
+
110
+
111
+ os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/preprocess_results"
112
+
113
+ from lipsync import apply_lipsync
114
+
115
+
116
+ def split_subtitles_max_duration(
117
+ subtitles,
118
+ max_seconds: float = 10.0,
119
+ min_last_chunk_seconds: float = 1.0,
120
+ ):
121
+ """
122
+ Take a list of srt.Subtitle and return a new list where
123
+ no subtitle duration is longer than max_seconds, except that
124
+ the *last* chunk is allowed to exceed max_seconds slightly
125
+ if the leftover duration would otherwise be less than
126
+ min_last_chunk_seconds.
127
+
128
+ Text is split by words roughly evenly across the chunks.
129
+ """
130
+ max_td = timedelta(seconds=max_seconds)
131
+ new_subs = []
132
+ new_index = 1
133
+
134
+ for sub in subtitles:
135
+ start = sub.start
136
+ end = sub.end
137
+ duration = end - start
138
+ total_secs = duration.total_seconds()
139
+
140
+ # If already short enough, just copy it
141
+ if total_secs <= max_seconds:
142
+ new_subs.append(
143
+ srt.Subtitle(
144
+ index=new_index,
145
+ start=start,
146
+ end=end,
147
+ content=sub.content,
148
+ )
149
+ )
150
+ new_index += 1
151
+ continue
152
+
153
+ # Need to split this subtitle
154
+ words = sub.content.split()
155
+ if not words:
156
+ # No text, skip
157
+ continue
158
+
159
+ # --- Determine number of chunks, avoiding tiny last chunk ---
160
+ base_chunks = int(total_secs // max_seconds)
161
+ remainder = total_secs - base_chunks * max_seconds
162
+
163
+ if base_chunks == 0:
164
+ # total_secs > max_seconds due to earlier check, but just in case
165
+ num_chunks = 1
166
+ else:
167
+ if remainder == 0:
168
+ num_chunks = base_chunks
169
+ elif remainder < min_last_chunk_seconds:
170
+ # Don't create a tiny last chunk; merge its time into previous chunks
171
+ num_chunks = base_chunks
172
+ else:
173
+ num_chunks = base_chunks + 1
174
+
175
+ # Ensure at least one chunk
176
+ num_chunks = max(1, num_chunks)
177
+
178
+ # Words per chunk (roughly even)
179
+ words_per_chunk = max(1, int(math.ceil(len(words) / num_chunks)))
180
+
181
+ chunk_start = start
182
+ word_idx = 0
183
+
184
+ for chunk_idx in range(num_chunks):
185
+ # Last chunk takes us all the way to the original end,
186
+ # so it can be slightly > max_seconds if needed.
187
+ if chunk_idx == num_chunks - 1:
188
+ chunk_end = end
189
+ else:
190
+ chunk_end = min(end, chunk_start + max_td)
191
+
192
+ if chunk_end <= chunk_start:
193
+ break
194
+
195
+ chunk_words = words[word_idx:word_idx + words_per_chunk]
196
+ word_idx += words_per_chunk
197
+
198
+ if not chunk_words:
199
+ break
200
+
201
+ new_subs.append(
202
+ srt.Subtitle(
203
+ index=new_index,
204
+ start=chunk_start,
205
+ end=chunk_end,
206
+ content=" ".join(chunk_words),
207
+ )
208
+ )
209
+ new_index += 1
210
+
211
+ chunk_start = chunk_end
212
+
213
+ return new_subs
214
+
215
+
216
+ def split_text_into_chunks(text, max_chars=400):
217
+ """
218
+ Rough splitter: breaks text into chunks <= max_chars,
219
+ preferring to split at sentence boundaries, then spaces.
220
+ """
221
+ text = text.strip()
222
+ chunks = []
223
+
224
+ while len(text) > max_chars:
225
+ # Try to split at the last sentence end before max_chars
226
+ split_at = max(
227
+ text.rfind(". ", 0, max_chars),
228
+ text.rfind("! ", 0, max_chars),
229
+ text.rfind("? ", 0, max_chars),
230
+ )
231
+
232
+ # If there was no sentence boundary, fall back to last space
233
+ if split_at == -1:
234
+ split_at = text.rfind(" ", 0, max_chars)
235
+
236
+ # If still nothing, just hard cut
237
+ if split_at == -1:
238
+ split_at = max_chars
239
+
240
+ chunk = text[:split_at + 1].strip()
241
+ chunks.append(chunk)
242
+ text = text[split_at + 1 :].strip()
243
+
244
+ if text:
245
+ chunks.append(text)
246
+
247
+ return chunks
248
+
249
+
250
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
251
+
252
+ # sh("find / -name \"libcudnn*\" 2>/dev/null")
253
+ # --------------------
254
+ # CONFIG
255
+ # --------------------
256
+ MODEL_SIZE = "medium" # e.g. "small", "medium", "large-v2"
257
+ MIN_SEGMENT_SECONDS = 0.5 # only transcribe segments longer than this
258
+
259
+ # If your pyannote pipeline needs a HF token, set it here or via env var:
260
+ # HUGGINGFACE_TOKEN = "hf_..."
261
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
262
+
263
+ # --------------------
264
+ # LOAD GLOBAL MODELS (ONCE)
265
+ # --------------------
266
+ device = "cuda" if torch.cuda.is_available() else "cpu"
267
+
268
+ print("Loading pyannote diarization model...")
269
+ diarization_pipeline = Pipeline.from_pretrained(
270
+ "pyannote/speaker-diarization-3.1"
271
+ )
272
+
273
+ # --------------------
274
+ # HELPERS
275
+ # --------------------
276
+ def format_timestamp(ts: float) -> str:
277
+ """Convert seconds to SRT timestamp format."""
278
+ hrs = int(ts // 3600)
279
+ mins = int((ts % 3600) // 60)
280
+ secs = int(ts % 60)
281
+ ms = int((ts - int(ts)) * 1000)
282
+ return f"{hrs:02d}:{mins:02d}:{secs:02d},{ms:03d}"
283
+
284
+
285
+ def extract_audio_to_wav(input_video: str, output_dir: str):
286
+
287
+ audio_file = os.path.join(output_dir, "audio_og.wav")
288
+ background_file = os.path.join(output_dir, "background_og.wav")
289
+ vocal_file = os.path.join(output_dir, "vocal_og.wav")
290
+ effect_file = os.path.join(output_dir, "effect_og.wav")
291
+
292
+ audio_16k_file = os.path.join(output_dir, "audio_16k.wav")
293
+
294
+ video_path = input_video
295
+ separator_dir = Path(os.path.join(output_dir, "separator_directory"))
296
+ os.makedirs(separator_dir, exist_ok=True)
297
+
298
+
299
+ # Extract raw audio
300
+ cmd = [
301
+ "ffmpeg",
302
+ "-loglevel", "error",
303
+ "-i", video_path,
304
+ "-vn",
305
+ "-acodec", "pcm_s16le",
306
+ "-ar", "44100",
307
+ "-ac", "2",
308
+ audio_file
309
+ ]
310
+ subprocess.run(cmd, check=True)
311
+
312
+ audio, sr = torchaudio.load(audio_file)
313
+ audio = audio.to("cuda")
314
+
315
+ with torch.no_grad():
316
+ dialog, effect, music = dnr_model(audio[None])
317
+
318
+ torchaudio.save(vocal_file, dialog.cpu(), sr)
319
+ torchaudio.save(effect_file, effect.cpu(), sr)
320
+ torchaudio.save(background_file, music.cpu(), sr)
321
+
322
+ # Convert vocals to 16k mono
323
+ cmd = [
324
+ "ffmpeg",
325
+ "-loglevel", "error",
326
+ "-y",
327
+ "-i", vocal_file,
328
+ "-ac", "1",
329
+ "-ar", "16000",
330
+ "-acodec", "pcm_s16le",
331
+ audio_16k_file
332
+ ]
333
+ subprocess.run(cmd, check=True)
334
+
335
+ return audio_file, effect_file, background_file, audio_16k_file, vocal_file
336
+
337
+
338
+ def diarize_audio(audio_path: str) -> List[Dict]:
339
+ """Run pyannote diarization and return segments."""
340
+
341
+ diarization_pipeline.to(torch.device(device))
342
+
343
+ with ProgressHook() as hook:
344
+ diarization_result = diarization_pipeline(audio_path, hook=hook)
345
+
346
+ segments = []
347
+ for segment, _, speaker in diarization_result.itertracks(yield_label=True):
348
+ duration = segment.end - segment.start
349
+ if duration >= MIN_SEGMENT_SECONDS:
350
+ segments.append(
351
+ {
352
+ "start": float(segment.start),
353
+ "end": float(segment.end),
354
+ "speaker": speaker,
355
+ }
356
+ )
357
+
358
+ segments.sort(key=lambda x: x["start"])
359
+ return segments
360
+
361
+
362
+ def chunk_to_float32(chunk: AudioSegment) -> np.ndarray:
363
+ """Convert a pydub chunk to mono 16kHz float32 numpy array in [-1, 1]."""
364
+ chunk = chunk.set_frame_rate(16000).set_channels(1)
365
+ samples = np.array(chunk.get_array_of_samples())
366
+
367
+ # Normalize based on sample width
368
+ if chunk.sample_width == 2: # 16-bit
369
+ samples = samples.astype(np.float32) / 32768.0
370
+ elif chunk.sample_width == 4: # 32-bit
371
+ samples = samples.astype(np.float32) / 2147483648.0
372
+ else:
373
+ samples = samples.astype(np.float32)
374
+
375
+ return samples
376
+
377
+
378
+ def transcribe_segment(whisper_model, samples: np.ndarray) -> str:
379
+ """Transcribe+translate a single segment with faster-whisper."""
380
+ segment_text_parts = []
381
+
382
+
383
+ segments, info = whisper_model.transcribe(
384
+ samples,
385
+ beam_size=1,
386
+ vad_filter=False, # diarization already detected speech
387
+ condition_on_previous_text=True, # carry context between Whisper's internal segments
388
+ task="translate", # translate to English
389
+ word_timestamps=True,
390
+ )
391
+
392
+ for seg in segments:
393
+ if seg.text:
394
+ segment_text_parts.append(seg.text.strip())
395
+
396
+ return " ".join(segment_text_parts)
397
+
398
+ def transcribe_segment_words(
399
+ whisper_model,
400
+ samples: np.ndarray,
401
+ offset_sec: float,
402
+ speaker: str | None = None,
403
+ ):
404
+ """
405
+ Transcribe+translate a single diarization segment, returning a
406
+ list of word dicts with absolute timestamps.
407
+ """
408
+ words_out = []
409
+
410
+ segments, info = whisper_model.transcribe(
411
+ samples,
412
+ beam_size=1,
413
+ vad_filter=False, # diarization already detected speech
414
+ condition_on_previous_text=False, # better for hard cuts / segments
415
+ task="translate",
416
+ word_timestamps=True,
417
+ )
418
+
419
+ for seg in segments:
420
+ if not seg.words:
421
+ continue
422
+ for w in seg.words:
423
+ words_out.append(
424
+ {
425
+ "start": offset_sec + float(w.start),
426
+ "end": offset_sec + float(w.end),
427
+ "text": w.word,
428
+ "speaker": speaker,
429
+ }
430
+ )
431
+
432
+ return words_out
433
+
434
+ def words_to_subtitles(words, max_seconds: float = 10.0):
435
+ """
436
+ Group word-level timings into SRT subtitles, each up to max_seconds long,
437
+ cutting ONLY at word boundaries, AND never mixing speakers in the same subtitle.
438
+ Whenever the speaker changes, we close the current subtitle and start a new one.
439
+
440
+ Expects each word dict to have:
441
+ - "start" (float, seconds)
442
+ - "end" (float, seconds)
443
+ - "text" (str)
444
+ - "speaker" (str or None)
445
+ """
446
+ # sort just in case
447
+ words = sorted(words, key=lambda w: w["start"])
448
+
449
+ subtitles = []
450
+ current_words = []
451
+ current_start = None
452
+ current_speaker = None
453
+
454
+ index = 1
455
+
456
+ for w in words:
457
+ w_start = w["start"]
458
+ w_end = w["end"]
459
+ w_speaker = w.get("speaker")
460
+
461
+ if current_start is None:
462
+ # start first subtitle
463
+ current_start = w_start
464
+ current_words = [w]
465
+ current_speaker = w_speaker
466
+ continue
467
+
468
+ speaker_changed = (w_speaker != current_speaker)
469
+ duration_if_added = w_end - current_start
470
+ exceeds_max = duration_if_added > max_seconds
471
+
472
+ # If adding this word would:
473
+ # - exceed max_seconds, OR
474
+ # - cross into a different speaker,
475
+ # then we close the current subtitle and start a new one.
476
+ if (speaker_changed or exceeds_max) and current_words:
477
+ text = " ".join(x["text"] for x in current_words).strip()
478
+ sub_start = current_start
479
+ sub_end = current_words[-1]["end"]
480
+
481
+ subtitles.append(
482
+ srt.Subtitle(
483
+ index=index,
484
+ start=timedelta(seconds=sub_start),
485
+ end=timedelta(seconds=sub_end),
486
+ content=text,
487
+ )
488
+ )
489
+ index += 1
490
+
491
+ # start new subtitle from this word
492
+ current_start = w_start
493
+ current_words = [w]
494
+ current_speaker = w_speaker
495
+ else:
496
+ current_words.append(w)
497
+
498
+ # flush last subtitle
499
+ if current_words:
500
+ text = " ".join(x["text"] for x in current_words).strip()
501
+ sub_start = current_start
502
+ sub_end = current_words[-1]["end"]
503
+ subtitles.append(
504
+ srt.Subtitle(
505
+ index=index,
506
+ start=timedelta(seconds=sub_start),
507
+ end=timedelta(seconds=sub_end),
508
+ content=text,
509
+ )
510
+ )
511
+
512
+ return subtitles
513
+
514
+ def build_srt(segments: List[Dict], audio_wav: str, out_srt_path: str):
515
+ """
516
+ Generate SRT file from diarized segments and audio,
517
+ using word-level timestamps and grouping into ~10s subtitles.
518
+ """
519
+ audio = AudioSegment.from_file(audio_wav)
520
+
521
+ print(f"Loading faster-whisper model ({MODEL_SIZE})...")
522
+ whisper_model = WhisperModel(
523
+ MODEL_SIZE,
524
+ device="cuda",
525
+ compute_type="float16",
526
+ )
527
+
528
+ all_words = []
529
+
530
+ for i, seg in enumerate(segments, start=1):
531
+ start_sec = seg["start"]
532
+ end_sec = seg["end"]
533
+ speaker = seg["speaker"]
534
+
535
+ start_ms = int(start_sec * 1000)
536
+ end_ms = int(end_sec * 1000)
537
+ chunk = audio[start_ms:end_ms]
538
+
539
+ samples = chunk_to_float32(chunk)
540
+
541
+ # get words for this diar segment, with absolute times
542
+ seg_words = transcribe_segment_words(
543
+ whisper_model,
544
+ samples,
545
+ offset_sec=start_sec,
546
+ speaker=speaker,
547
+ )
548
+
549
+ all_words.extend(seg_words)
550
+ print(f"Diar segment {i} ({speaker}): {len(seg_words)} words")
551
+
552
+ # group words into ≤10s subtitles, word aligned
553
+ subtitles = words_to_subtitles(all_words, max_seconds=10.0)
554
+
555
+ # write SRT
556
+ with open(out_srt_path, "w", encoding="utf-8") as f:
557
+ f.write(srt.compose(subtitles))
558
+
559
+ def translate_video(video_file):
560
+
561
+ return process_video(video_file, False)
562
+
563
+ def translate_lipsync_video(video_file):
564
+
565
+ return process_video(video_file, True)
566
+
567
+ def run_example(video_file, allow_lipsync, duration):
568
+
569
+ with timer("processed"):
570
+ result = process_video(video_file, allow_lipsync, duration)
571
+
572
+ return result
573
+
574
+ @spaces.GPU(duration=350)
575
+ def process_video(video_file, allow_lipsync, duration = 30):
576
+ """
577
+ Gradio callback:
578
+ - video_file: temp file object/path from Gradio
579
+ - returns path to generated SRT file (for download)
580
+ """
581
+ if video_file is None:
582
+ raise gr.Error("Please upload an MP4 video.")
583
+
584
+ session_id = uuid.uuid4().hex
585
+
586
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
587
+ os.makedirs(output_dir, exist_ok=True)
588
+
589
+ # Gradio's File/Video component gives dict or str depending on version
590
+ if isinstance(video_file, dict):
591
+ video_path = video_file.get("name") or video_file.get("path")
592
+ else:
593
+ video_path = video_file
594
+
595
+ if video_path is None or not os.path.exists(video_path):
596
+ raise gr.Error("Could not read uploaded video file.")
597
+
598
+ # Path for the generated SRT inside the session output directory
599
+ srt_path = os.path.join(output_dir, "diarized_translated.srt")
600
+
601
+ src_video_path = video_path
602
+
603
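+ # trim the upload so only the requested number of seconds is processed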
+ cropped_video_path = os.path.join(output_dir, "input_30s.mp4")
604
+
605
+ duration_s = int(duration)
606
+
607
+ cmd = [
608
+ "ffmpeg",
609
+ "-y",
610
+ "-i", src_video_path,
611
+ "-t", f"{duration_s}",
612
+ "-c", "copy", # stream copy, no re-encode
613
+ cropped_video_path,
614
+ ]
615
+ subprocess.run(cmd, check=True)
616
+ video_path = cropped_video_path
617
+
618
+ # 1. Extract audio
619
+ audio_wav, effect_wav, background_wav, audio_16k_wav, vocal_wav = extract_audio_to_wav(video_path, output_dir)
620
+
621
+ # 2. Diarization
622
+ segments = diarize_audio(audio_16k_wav)
623
+ if not segments:
624
+ raise gr.Error("No valid speech segments found for diarization.")
625
+
626
+ # 3. Build SRT from diarized segments + whisper
627
+ with timer("Generating srt"):
628
+ build_srt(segments, audio_16k_wav, srt_path)
629
+
630
+ # ---- ORIGINAL SRT (used for TTS) ----
631
+ with open(srt_path, "r", encoding="utf-8") as f:
632
+ srt_data = f.read()
633
+
634
+ subtitles = list(srt.parse(srt_data))
635
+
636
+ # Keep this list as-is for TTS timing
637
+ tts_subtitles = subtitles
638
+
639
+ # ---- CREATE 10s-MAX SRT FOR DOWNLOAD ----
640
+ max10_subtitles = tts_subtitles
641
+ # max10_subtitles = split_subtitles_max_duration(subtitles, max_seconds=10.0)
642
+
643
+ tts_subtitles = max10_subtitles
644
+
645
+ srt_10s_path = os.path.join(output_dir, "diarized_translated_max10s.srt")
646
+ with open(srt_10s_path, "w", encoding="utf-8") as f:
647
+ f.write(srt.compose(max10_subtitles))
648
+
649
+ # ---- TTS USING ORIGINAL SRT ----
650
+ last_end_seconds = tts_subtitles[-1].end.total_seconds()
651
+ total_ms = int((last_end_seconds + 1) * 1000)
652
+
653
+ timeline = AudioSegment.silent(duration=total_ms)
654
+
655
+ original_audio = AudioSegment.from_file(audio_wav)
656
+
657
+ MAX_BATCH_MS = 300_000 # ~5 minutes of target subtitle duration per batch
658
+
659
+ with timer("Generating speech"):
660
+ num_subs = len(tts_subtitles)
661
+ idx = 0
662
+
663
+ while idx < num_subs:
664
+ spk_prompts = [] # paths to src_prompt_*.wav
665
+ texts = [] # subtitle texts for this batch
666
+ out_paths = [] # where IndexTTS2 will save generated wavs
667
+ starts_ms = [] # for overlaying later
668
+ target_ms_list = [] # per-subtitle target durations
669
+ batch_ms_sum = 0
670
+
671
+ batch_start = idx
672
+
673
+ # ---- fill one batch until we hit ~MAX_BATCH_MS ----
674
+ while idx < num_subs:
675
+ sub = tts_subtitles[idx]
676
+
677
+ start_ms = int(sub.start.total_seconds() * 1000)
678
+ end_ms = int(sub.end.total_seconds() * 1000)
679
+ target_ms = max(end_ms - start_ms, 0)
680
+
681
+ # If adding this subtitle would exceed the limit and we already
682
+ # have something in the batch, stop and process the current batch.
683
+ if batch_ms_sum + target_ms > MAX_BATCH_MS and len(target_ms_list) > 0:
684
+ break
685
+
686
+ global_idx = idx
687
+
688
+ # 1) prompt audio for this subtitle
689
+ src_chunk = original_audio[start_ms:end_ms]
690
+ src_prompt_path = os.path.join(output_dir, f"src_prompt_{global_idx}.wav")
691
+ src_chunk.export(src_prompt_path, format="wav")
692
+
693
+ # 2) text + output path
694
+ text = sub.content.replace("\n", " ")
695
+ out_path = os.path.join(output_dir, f"gen_{global_idx}.wav")
696
+
697
+ spk_prompts.append(src_prompt_path)
698
+ texts.append(text)
699
+ out_paths.append(out_path)
700
+ starts_ms.append(start_ms)
701
+ target_ms_list.append(target_ms)
702
+
703
+ batch_ms_sum += target_ms
704
+ idx += 1
705
+
706
+ print(f"batch from {batch_start} to {idx - 1}, batch_ms_sum: {batch_ms_sum}")
707
+
708
+ # --- call batched TTS once for this batch ---
709
+ do_sample = True
710
+ top_p = 0.8
711
+ top_k = 30
712
+ temperature = 0.8
713
+ length_penalty = 0.0
714
+ num_beams = 3
715
+ repetition_penalty = 10.0
716
+ max_mel_tokens = 1500
717
+
718
+ # You could compute some aggregate target_length_ms here if your API supports it,
719
+ # e.g. avg or max(target_ms_list). For now, keep None as before.
720
+ tts_outputs = tts.infer_batch(
721
+ spk_audio_prompts=spk_prompts,
722
+ texts=texts,
723
+ output_paths=out_paths,
724
+ emo_audio_prompts=None,
725
+ emo_alpha=1.0,
726
+ emo_vectors=None,
727
+ use_emo_text=False,
728
+ emo_texts=None,
729
+ use_random=False,
730
+ interval_silence=200,
731
+ verbose=False,
732
+ max_text_tokens_per_segment=120,
733
+ speed=1.0,
734
+ target_length_ms=target_ms_list,
735
+ do_sample=do_sample,
736
+ top_p=top_p,
737
+ top_k=top_k,
738
+ temperature=temperature,
739
+ length_penalty=length_penalty,
740
+ num_beams=num_beams,
741
+ repetition_penalty=repetition_penalty,
742
+ max_mel_tokens=max_mel_tokens,
743
+ )
744
+
745
+ # --- read generated wavs and overlay them ---
746
+ for local_idx, out_path in enumerate(tts_outputs):
747
+ start_ms = starts_ms[local_idx]
748
+
749
+ seg = AudioSegment.from_file(out_path, format="wav")
750
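+ # reduce the generated TTS segment by 2 dB before overlaying it onto the timeline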
+ seg = seg - 2
751
+ timeline = timeline.overlay(seg, position=start_ms)
752
+
753
+ # cleanup
754
+ os.remove(out_path)
755
+ os.remove(spk_prompts[local_idx])
756
+
757
+ # -------------------------------------------------------
758
+ # Bring back original dialog in the *gaps* (grunts, etc.)
759
+ # -------------------------------------------------------
760
+ # Load separated dialog track
761
+ dialog = AudioSegment.from_file(vocal_wav)
762
+
763
+ # Make sure it matches the TTS timeline parameters
764
+ dialog = dialog.set_frame_rate(timeline.frame_rate).set_channels(timeline.channels)
765
+
766
+ total_len_ms = len(timeline)
767
+
768
+ # Collect speech regions from subtitles (approximate "where TTS will speak")
769
+ speech_regions = []
770
+ for sub in tts_subtitles:
771
+ start_ms = int(sub.start.total_seconds() * 1000)
772
+ end_ms = int(sub.end.total_seconds() * 1000)
773
+ # clamp to track length
774
+ start_ms = max(0, min(start_ms, total_len_ms))
775
+ end_ms = max(0, min(end_ms, total_len_ms))
776
+ if end_ms > start_ms:
777
+ speech_regions.append((start_ms, end_ms))
778
+
779
+ # Merge overlapping/adjacent regions
780
+ speech_regions.sort()
781
+ merged = []
782
+ for s, e in speech_regions:
783
+ if not merged:
784
+ merged.append([s, e])
785
+ else:
786
+ last_s, last_e = merged[-1]
787
+ if s <= last_e: # overlap or touch
788
+ merged[-1][1] = max(last_e, e)
789
+ else:
790
+ merged.append([s, e])
791
+
792
+ # Compute the complement: regions where there's NO subtitle (gaps)
793
+ gaps = []
794
+ cursor = 0
795
+ for s, e in merged:
796
+ if cursor < s:
797
+ gaps.append((cursor, s))
798
+ cursor = max(cursor, e)
799
+ if cursor < total_len_ms:
800
+ gaps.append((cursor, total_len_ms))
801
+
802
+ # Overlay original dialog only in those gaps
803
+ MIN_GAP_MS = 10 # ignore ultra-tiny gaps
804
+
805
+ for g_start, g_end in gaps:
806
+ if g_end - g_start < MIN_GAP_MS:
807
+ continue
808
+
809
+ # Extract that piece of the original dialog
810
+ original_chunk = dialog[g_start:g_end]
811
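+ # boost the original dialog chunk by 6 dB so it stays audible in the gaps between subtitles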
+ original_chunk = original_chunk + 6
812
+
813
+ timeline = timeline.overlay(original_chunk, position=g_start)
814
+
815
+
816
+ video_in = video_path
817
+ audio_in = output_dir + "/final_output.wav"
818
+ audio_16k_in = output_dir + "/final_16k_output.wav"
819
+
820
+ # ---------- 5. Mix background + new TTS vocal ----------
821
+
822
+ if background_wav is not None:
823
+ eff = AudioSegment.from_file(effect_wav)
824
+ bg = AudioSegment.from_file(background_wav)
825
+
826
+
827
+
828
+ # If the effects or background tracks are shorter than the TTS timeline, loop them
829
+ if len(eff) < len(timeline):
830
+ loops = math.ceil(len(timeline) / len(eff))
831
+ eff = eff * loops
832
+
833
+ if len(bg) < len(timeline):
834
+ loops = math.ceil(len(timeline) / len(bg))
835
+ bg = bg * loops
836
+
837
+
838
+
839
+ # Cut or match to TTS length
840
+ eff = eff[:len(timeline)]
841
+ bg = bg[:len(timeline)]
842
+
843
+
844
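+ # lift the background music and effects by 6 dB before mixing with the TTS vocal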
+ bg = bg + 6
845
+ eff = eff + 6
846
+
847
+ eff_timeline = eff.overlay(timeline)
848
+ final_audio = bg.overlay(eff_timeline)
849
+ final_16k_audio = timeline.set_frame_rate(16000).set_channels(1)
850
+ else:
851
+ # Fallback: no background found, just use TTS
852
+ final_audio = timeline
853
+ final_16k_audio = timeline
854
+
855
+ final_audio.export(audio_in, format="wav")
856
+ final_16k_audio.export(audio_16k_in, format="wav")
857
+
858
+ print(f"Done! Saved to {audio_in}")
859
+
860
+ lipsynced_video = output_dir + "/output_with_lipsync_16k.mp4"
861
+
862
+ if allow_lipsync:
863
+ apply_lipsync(video_in, audio_16k_in, lipsynced_video)
864
+ else:
865
+ lipsynced_video = video_in
866
+
867
+ video_out = output_dir + "/output_with_lipsync.mp4"
868
+
869
+
870
+ cmd = [
871
+ "ffmpeg",
872
+ "-loglevel", "error",
873
+ "-y", # overwrite output file
874
+ "-i", lipsynced_video, # input video
875
+ "-i", audio_in, # new audio
876
+ "-c:v", "copy", # do not re-encode video
877
+ "-map", "0:v:0", # take video from input 0
878
+ "-map", "1:a:0", # take audio from input 1
879
+ "-shortest", # stop when either track ends
880
+ video_out,
881
+ ]
882
+
883
+ subprocess.run(cmd, check=True)
884
+
885
+
886
+ # IMPORTANT: return the 10s-max SRT for download
887
+ return video_out, srt_10s_path, audio_16k_in
888
+
889
+
890
+
891
+ css = """
892
+ #col-container {
893
+ margin: 0 auto;
894
+ max-width: 1600px;
895
+ }
896
+ #modal-container {
897
+ width: 100vw; /* Take full viewport width */
898
+ height: 100vh; /* Take full viewport height (optional) */
899
+ display: flex;
900
+ justify-content: center; /* Center content horizontally */
901
+ align-items: center; /* Center content vertically if desired */
902
+ }
903
+ #modal-content {
904
+ width: 100%;
905
+ max-width: 700px; /* Limit content width */
906
+ margin: 0 auto;
907
+ border-radius: 8px;
908
+ padding: 1.5rem;
909
+ }
910
+ #step-column {
911
+ padding: 10px;
912
+ border-radius: 8px;
913
+ box-shadow: var(--card-shadow);
914
+ margin: 10px;
915
+ }
916
+ #col-showcase {
917
+ margin: 0 auto;
918
+ max-width: 1100px;
919
+ }
920
+ .button-gradient {
921
+ background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
922
+ border: none;
923
+ padding: 14px 28px;
924
+ font-size: 16px;
925
+ font-weight: bold;
926
+ color: white;
927
+ border-radius: 10px;
928
+ cursor: pointer;
929
+ transition: 0.3s ease-in-out;
930
+ animation: 2s linear 0s infinite normal none running gradientAnimation;
931
+ box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
932
+ }
933
+ .toggle-container {
934
+ display: inline-flex;
935
+ background-color: #ffd6ff; /* light pink background */
936
+ border-radius: 9999px;
937
+ padding: 4px;
938
+ position: relative;
939
+ width: fit-content;
940
+ font-family: sans-serif;
941
+ }
942
+ .toggle-container input[type="radio"] {
943
+ display: none;
944
+ }
945
+ .toggle-container label {
946
+ position: relative;
947
+ z-index: 2;
948
+ flex: 1;
949
+ text-align: center;
950
+ font-weight: 700;
951
+ color: #4b2ab5; /* dark purple text for unselected */
952
+ padding: 6px 22px;
953
+ border-radius: 9999px;
954
+ cursor: pointer;
955
+ transition: color 0.25s ease;
956
+ }
957
+ /* Moving highlight */
958
+ .toggle-highlight {
959
+ position: absolute;
960
+ top: 4px;
961
+ left: 4px;
962
+ width: calc(50% - 4px);
963
+ height: calc(100% - 8px);
964
+ background-color: #4b2ab5; /* dark purple background */
965
+ border-radius: 9999px;
966
+ transition: transform 0.25s ease;
967
+ z-index: 1;
968
+ }
969
+ /* When "True" is checked */
970
+ #true:checked ~ label[for="true"] {
971
+ color: #ffd6ff; /* light pink text */
972
+ }
973
+ /* When "False" is checked */
974
+ #false:checked ~ label[for="false"] {
975
+ color: #ffd6ff; /* light pink text */
976
+ }
977
+ /* Move highlight to right side when False is checked */
978
+ #false:checked ~ .toggle-highlight {
979
+ transform: translateX(100%);
980
+ }
981
+ """
982
+
983
+
984
+ with gr.Blocks(css=css) as demo:
985
+
986
+ with gr.Column(elem_id="col-container"):
987
+ gr.HTML(
988
+ """
989
+ <div style="text-align: center;">
990
+ <p style="font-size:16px; display: inline; margin: 0;">
991
+ <strong>OutofSync </strong>
992
+ </p>
993
+ <p style="font-size:16px; display: inline; margin: 0;">
994
+ -- HF Space By:
995
+ </p>
996
+ <a href="https://huggingface.co/alexnasa" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
997
+ <img src="https://img.shields.io/badge/🤗-Follow Me-yellow.svg">
998
+ </a>
999
+ <a href="https://www.buymeacoffee.com/outofai" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;" target="_blank"><img src="https://img.shields.io/badge/-buy_me_a%C2%A0coffee-red?logo=buy-me-a-coffee" alt="Buy Me A Coffee"></a>
1000
+ </div>
1001
+ """
1002
+ )
1003
+
1004
+ with gr.Row():
1005
+ with gr.Column(elem_id="step-column"):
1006
+ gr.HTML("""
1007
+ <div>
1008
+ <span style="font-size: 24px;">1. Upload a Video</span><br>
1009
+ </div>
1010
+ """)
1011
+
1012
+ video_input = gr.Video(
1013
+ label="OG Clip",
1014
+ height=512
1015
+ )
1016
+
1017
+ with gr.Column(elem_id="step-column"):
1018
+ gr.HTML("""
1019
+ <div>
1020
+ <span style="font-size: 24px;">2. Translate + 💋 </span><br>
1021
+ </div>
1022
+ """)
1023
+
1024
+ video_output = gr.Video(label="Output", height=512)
1025
+ lipsync = gr.Checkbox(label="Lipsync", value=False, visible=False)
1026
+ duration = gr.Slider(0, 30, 30, step=10)
1027
+ translate_btn = gr.Button("🤹‍♂️ Translate")
1028
+ translate_lipsync_btn = gr.Button("🤹‍♂️ Translate + 💋 Lipsync", variant='primary', elem_classes="button-gradient")
1029
+
1030
+ with gr.Column(elem_id="step-column"):
1031
+ vocal_16k_output = gr.File(label="Vocal 16k", visible=False)
1032
+ srt_output = gr.File(label="Download translated diarized SRT", visible=False)
1033
+
1034
+ cached_examples = gr.Examples(
1035
+ examples=[
1036
+
1037
+ [
1038
+ "assets/popup-2.mp4",
1039
+ False,
1040
+ 10
1041
+ ],
1042
+
1043
+ [
1044
+ "assets/popup-2.mp4",
1045
+ False,
1046
+ 20
1047
+ ],
1048
+
1049
+ [
1050
+ "assets/popup-2.mp4",
1051
+ False,
1052
+ 30
1053
+ ],
1054
+
1055
+ [
1056
+ "assets/german.mp4",
1057
+ True,
1058
+ 10
1059
+ ],
1060
+
1061
+ [
1062
+ "assets/popup-2.mp4",
1063
+ True,
1064
+ 20
1065
+ ],
1066
+
1067
+ [
1068
+ "assets/popup-2.mp4",
1069
+ True,
1070
+ 30
1071
+ ],
1072
+
1073
+ [
1074
+ "assets/popup-2.mp4",
1075
+ True,
1076
+ 10
1077
+ ],
1078
+
1079
+ [
1080
+ "assets/italian.mp4",
1081
+ True,
1082
+ 10
1083
+ ],
1084
+
1085
+ [
1086
+ "assets/french-movie.mp4",
1087
+ True,
1088
+ 10
1089
+ ],
1090
+
1091
+ ],
1092
+ label="Cached Examples",
1093
+ fn=process_video,
1094
+ inputs=[video_input, lipsync, duration],
1095
+ outputs=[video_output, srt_output, vocal_16k_output],
1096
+ cache_examples=True
1097
+ )
1098
+
1099
+ translate_btn.click(
1100
+ fn=translate_video,
1101
+ inputs=[video_input],
1102
+ outputs=[video_output, srt_output, vocal_16k_output],
1103
+ )
1104
+
1105
+ translate_lipsync_btn.click(
1106
+ fn=translate_lipsync_video,
1107
+ inputs=[video_input],
1108
+ outputs=[video_output, srt_output, vocal_16k_output],
1109
+ )
1110
+
1111
+ if __name__ == "__main__":
1112
+ demo.queue()
1113
+ demo.launch()
assets/french-ladder.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c7242b9627b46650a4189feaba08226eb31110a61fb19773dd8d813e7e8a2f4
3
+ size 673836
assets/french-long.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb9d6be501d45cbd154dfa21e986bcf972aed1f6bf89ee99cceb2fdd03da3366
3
+ size 1591201
assets/french-movie.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0d46088490d5928f632c78ccabd91e1b122268e7009eda4d0b0ffa5dd90f659
3
+ size 610342
assets/german-5.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc3a75afe756c44aad4541c6ef2dc6ac2cfec9507594fe6cc7b005e4c8cea83a
3
+ size 952730
assets/german.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab85fcfcccbd49ab59b2bc05a927c0ab5b6683ea34c0edee45f46b8472c87652
3
+ size 387333
assets/italian.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc9b510c734b833d7502cb3e2f34f0b5c3018de6bb82cb531fd0ec4585a4d8a
3
+ size 458458
assets/movie.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecbb2f43ff7614b1e825ab331fce19907bbb3343cef98e0dacf62ca9c4463ed4
3
+ size 17398047
assets/noisy-french.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1884cd59445f274498698aaab1fc31706b10a7ef64016a779ac7dcef23f9d46
3
+ size 902524
assets/old-french.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7029400cc0be92df96e23c668f72b589b2f2fe3eaa07bd9267d86cc8ccdbf5a
3
+ size 373654
assets/popup-2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b706c1efe8bbbf50521280990011e401c81905005eebdb5ccd2d53392b677621
3
+ size 77033055
assets/port.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99b0b97048d46629baf7e81b5efc0fbb6fdc5cd338e71dc9664a9aade1f5b1be
3
+ size 3674780
assets/rus.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7df70f3d85dc8b1eca364b4665899efd24c8808f43113ef3b1412a2096ce796
3
+ size 2288293
assets/spanish-2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07fba8382d040f191e67fd847e62d31dc8a607b62bc5bdb2611c1f4998dce1a
3
+ size 129943
assets/spanish-3.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d87c8f424c78f531c8cb0ee4a7b6d9d0fc5aa19815bb7f0cac3a645305dcf757
3
+ size 442383
assets/spanish.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6190dc37f372b4d9de0959acbfd89f72c5515169bd70a723b46e875d3a466627
3
+ size 839466
assets/trolls.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e6a0fd5e6202a9e234a51c297d9a6db35a0fe87c813eb2c8599b568004f62b4
3
+ size 9604495
checkpoints/2DFAN4-cd938726ad.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd938726adb1f15f361263cce2db9cb820c42585fa8796ec72ce19107f369a46
3
+ size 96316515
checkpoints/mobilenet0.25_Final.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220
3
+ size 1789735
checkpoints/mobilenet_224_model_best_gdconv_external.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:826b3c902e70e1eeb177f35c73198af0714f74502fe7bd3cdea42e847b1ca30f
3
+ size 15239204
lipsync.py ADDED
@@ -0,0 +1,31 @@
1
+
2
+ from outofsync.latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
3
+ import torch
4
+
5
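+ # construct the lipsync pipeline once at module load and keep it on the GPU so apply_lipsync can reuse it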
+ pipeline = LipsyncPipeline(
6
+ scheduler=None,
7
+ vae=None,
8
+ audio_encoder=None,
9
+ unet=None,
10
+ ).to("cuda")
11
+
12
+ def apply_lipsync(video_input_path, audio_path, video_out_path):
13
+
14
+ torch.manual_seed(1234)
15
+
16
+ print(f"Initial seed: {torch.initial_seed()}")
17
+
18
+ pipeline(
19
+ video_path=video_input_path,
20
+ audio_path=audio_path,
21
+ video_out_path=video_out_path,
22
+ video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
23
+ num_frames=16,
24
+ num_inference_steps=20,
25
+ guidance_scale=1.0,
26
+ weight_dtype=torch.float16,
27
+ width=256,
28
+ height=256,
29
+ )
30
+
31
+ return video_out_path
requirements.txt ADDED
@@ -0,0 +1,74 @@
1
+ # --- Core Torch Stack ---
2
+ torchaudio==2.8.0
3
+ torchvision==0.23.0
4
+ triton
5
+ deepspeed==0.17.1
6
+ flash-attn-3 @ https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
7
+
8
+ # --- Whisper / ASR / Audio Processing ---
9
+ pyannote.audio
10
+ ctranslate2
11
+ faster-whisper[cuda12x]
12
+ pydub==0.25.1
13
+ srt
14
+ audio-separator==0.24.1
15
+ ffmpeg-python==0.2.0
16
+ python_speech_features==0.6
17
+ librosa==0.10.2.post1 # higher than 0.10.1
18
+
19
+ # --- NLP / Transformers / ML Utils ---
20
+ accelerate==1.8.1 # higher than 0.26.1
21
+ transformers==4.52.3 # higher than 4.52.1
22
+ tokenizers==0.21.0
23
+ sentencepiece
24
+ g2p-en==2.1.0
25
+ cn2an==0.5.22
26
+ textstat
27
+ omegaconf==2.3.0 # explicit highest version
28
+ munch==4.0.0
29
+ tqdm
30
+ json5==0.10.0
31
+
32
+ # --- Vision / Diffusion / Media ---
33
+ diffusers==0.33.1
34
+ huggingface-hub<1.0
35
+ imageio==2.27.0
36
+ decord==0.6.0
37
+ opencv-python==4.9.0.80 # same version, deduped
38
+ mediapipe==0.10.11
39
+ av
40
+ torch-fidelity==0.3.0
41
+ torchmetrics==1.3.1
42
+ lpips==0.1.4
43
+ face-alignment==1.4.1
44
+ insightface==0.7.3
45
+ kornia==0.8.0
46
+ scenedetect==0.6.1
47
+ moviepy==1.0.3
48
+
49
+ # --- Numerical / Scientific ---
50
+ numpy==1.26.2 # higher than 1.24.4
51
+ pandas==2.1.3 # higher than 2.0.3
52
+ matplotlib==3.8.2
53
+ numba==0.58.1
54
+ Cython==3.0.7
55
+ einops==0.7.0
56
+ ninja==1.11.1.1
57
+
58
+ # --- Model Repos & Tools ---
59
+ descript-audiotools @ git+https://github.com/descriptinc/audiotools.git
60
+ tigersound @ git+https://github.com/OutofAi/tigersound.git
61
+ tensorboard
62
+ hf-xet==1.1.8
63
+ DeepCache==0.1.1
64
+ ultralytics
65
+ batch-face
66
+ modelscope==1.27.0
67
+ onnxruntime-gpu==1.21.0
68
+
69
+ # --- Language-specific packages ---
70
+ jieba==0.42.1
71
+
72
+ # --- Environment-specific ---
73
+ WeTextProcessing; platform_machine != "Darwin"
74
+ wetext; platform_system == "Darwin"
time_util.py ADDED
@@ -0,0 +1,9 @@
1
+ import time
2
+ from contextlib import contextmanager
3
+
4
+ @contextmanager
5
+ def timer(name: str):
6
+ start = time.time()
7
+ print(f"{name}...")
8
+ yield
9
+ print(f" -> {name} completed in {time.time() - start:.2f} sec")
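A minimal usage sketch of the timer helper above (illustrative only, not part of this commit; it just exercises the context manager defined in time_util.py):

    from time_util import timer
    import time

    with timer("demo step"):    # prints "demo step..."
        time.sleep(0.5)         # then prints " -> demo step completed in 0.50 sec"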