Eburon-Realtime-v1

Sleeping

App Files Community

aitekphsoftware committed 12 days ago

Commit

230256d

verified ·

1 Parent(s): 46c1003

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -56

app.py CHANGED Viewed

@@ -15,29 +15,29 @@ EMOTION_PRESETS = {
         "pitch_offset": 0,
     },
     "Comedy / Playful (Taglish)": {
-        "rate_offset": 12,   # faster
-        "pitch_offset": 4,   # slightly brighter
     },
     "Storytelling / Warm": {
-        "rate_offset": -6,   # a bit slower
-        "pitch_offset": 2,   # slightly warmer
     },
     "Emotional / Heartfelt": {
-        "rate_offset": -10,  # slower
-        "pitch_offset": 5,   # a bit more open / emotional
     },
     "Angry / Rant": {
-        "rate_offset": 15,
-        "pitch_offset": 3,
     },
     "Sad / Dramatic": {
         "rate_offset": -18,
-        "pitch_offset": -5,
     },
 }
 # -----------------------------
-# Custom CSS – injected via <style> for Gradio 4.36.1+
 # -----------------------------
 EBURON_CSS = """
 body {
@@ -245,7 +245,7 @@ label span, .gr-textbox label, .gr-slider label, .gr-dropdown label {
     color: #9ca3af;
 }
-/* Warning styling (Gradio Alert) */
 .svelte-1g805jl {
     border-radius: 999px !important;
 }
@@ -261,28 +261,37 @@ label span, .gr-textbox label, .gr-slider label, .gr-dropdown label {
 """
 # -----------------------------
-# Helper: normalize expressive cues like [pause], [laugh]
 # -----------------------------
 def normalize_script_for_tts(text: str) -> str:
     """
     Convert expressive cues [pause], [laugh], [energetic intro] etc.
-    into punctuation so Edge TTS won't literally read the brackets.
     """
     def _repl(match: re.Match) -> str:
         cue = match.group(1).strip().lower()
-        if "pause" in cue or "beat" in cue:
-            return " … "
         if "laugh" in cue or "chuckle" in cue:
-            return " "
         if "intro" in cue or "outro" in cue:
             return " "
         if "soft" in cue or "whisper" in cue:
             return " "
-        # Default: remove bracket cue
         return " "
-    return re.sub(r"\[(.*?)\]", _repl, text)
 # -----------------------------
@@ -298,14 +307,14 @@ async def get_voices():
     return voice_labels
-async def text_to_speech(text, voice, rate, pitch, emotion, expressiveness):
     if not text.strip():
         return None, "Please enter some text to synthesize."
     if not voice:
         return None, "Please select a voice."
-    # Clean expressive brackets for TTS
     clean_text = normalize_script_for_tts(text)
     voice_short_name = voice.split(" - ")[0].strip()
@@ -314,15 +323,16 @@ async def text_to_speech(text, voice, rate, pitch, emotion, expressiveness):
     # Emotion → rate/pitch shaping
     # -----------------------------
     preset = EMOTION_PRESETS.get(emotion or "Neutral", EMOTION_PRESETS["Neutral"])
-    factor = max(0.0, float(expressiveness) / 100.0)  # 0 to 2.0
     rate_offset = int(preset["rate_offset"] * factor)
     pitch_offset = int(preset["pitch_offset"] * factor)
-    eff_rate = int(rate + rate_offset)
-    eff_pitch = int(pitch + pitch_offset)
-    # Clamp into slider ranges
     eff_rate = max(-50, min(50, eff_rate))
     eff_pitch = max(-20, min(20, eff_pitch))
@@ -343,12 +353,12 @@ async def text_to_speech(text, voice, rate, pitch, emotion, expressiveness):
     return tmp_path, None
-async def tts_interface(text, voice, rate, pitch, emotion, expressiveness):
     audio, warning = await text_to_speech(
         text=text,
         voice=voice,
-        rate=rate,
-        pitch=pitch,
         emotion=emotion,
         expressiveness=expressiveness,
     )
@@ -358,20 +368,21 @@ async def tts_interface(text, voice, rate, pitch, emotion, expressiveness):
 # -----------------------------
-# Eburon Speech Studio v1.8 – ElevenLabs-style UI
 # -----------------------------
 async def create_demo():
     voices = await get_voices()
-    # Sample Taglish Alex Calleja–style comedy script as default
     sample_script = (
         "[energetic intro]\n"
-        "Magandang gabi sa inyong lahat! Ako nga pala si Alex… hindi Calleja, pero pwede na rin sa murang kopya. "
-        "[pause] Parang Shopee version ng Netflix special.\n\n"
         "[conversational]\n"
-        "Alam n’yo, mahirap na maging adult ngayon. Nung bata tayo, gusto natin tumanda para “walang mag-uutos”. "
-        "Ngayon, tumanda tayo… at ang pinaka-maingay mag-utos: BILLS. [pause]\n"
-        "Kuryente, tubig, WiFi, GCash utang, BNPL… parang ex na hindi makamove on. Laging bumabalik buwan-buwan.\n\n"
         "[teasing tone]\n"
         "Tapos ‘yung kuryente, grabe. Kahit wala ka sa bahay, mataas pa rin bill. "
         "Parang Meralco, nag-a-assume: “Alam naming may iyak ka sa dilim, may load ‘yan sa emosyon.” [laugh]\n\n"
@@ -391,7 +402,7 @@ async def create_demo():
     )
     with gr.Blocks(title="Eburon Speech Studio v1.8") as demo:
-        # Inject CSS (works in Gradio 4.36.1 and 6+)
         gr.HTML(f"<style>{EBURON_CSS}</style>", elem_id="eburon-style-inject")
         with gr.Column(elem_id="eburon-root"):
@@ -423,7 +434,7 @@ async def create_demo():
                 """
             )
-            # Main: script + voice/emotion
             with gr.Row():
                 # Left: Script
                 with gr.Column(scale=2, min_width=460):
@@ -434,11 +445,11 @@ async def create_demo():
                                 <div>
                                     <div class="eburon-section-title">Script</div>
                                     <div class="eburon-section-subtitle">
-                                        Taglish Alex Calleja–style skit with expressive cues like [pause], [laugh], [energetic intro].
                                     </div>
                                 </div>
                                 <div class="eburon-mini-pill">
-                                    Bracket cues are auto-cleaned before TTS
                                 </div>
                             </div>
                             """
@@ -446,11 +457,10 @@ async def create_demo():
                         text_input = gr.Textbox(
                             label="",
                             value=sample_script,
-                            placeholder="Write your expressive Taglish skit here...",
-                            lines=14,
                         )
-                # Right: Voice & expressive controls
                 with gr.Column(scale=1, min_width=340):
                     with gr.Group(elem_classes="eburon-main-card"):
                         gr.HTML(
@@ -459,7 +469,7 @@ async def create_demo():
                                 <div>
                                     <div class="eburon-section-title">Voice & Delivery</div>
                                     <div class="eburon-section-subtitle">
-                                        ElevenLabs v3 style: emotion preset + intensity + fine speed & pitch.
                                     </div>
                                 </div>
                                 <div class="eburon-mini-pill">
@@ -473,41 +483,41 @@ async def create_demo():
                             choices=[""] + voices,
                             label="Voice",
                             value="",
-                            info="Pick a neural voice from the Edge TTS catalog.",
                         )
-                        rate_slider = gr.Slider(
                             minimum=-50,
                             maximum=50,
-                            value=0,
                             label="Base Speed",
                             step=1,
-                            info="Manual speed baseline. Emotion will adjust on top of this.",
                         )
-                        pitch_slider = gr.Slider(
                             minimum=-20,
                             maximum=20,
-                            value=0,
                             label="Base Pitch",
                             step=1,
-                            info="Manual pitch baseline. Emotion will adjust on top of this.",
                         )
                         emotion_dropdown = gr.Dropdown(
                             label="Emotion preset",
                             choices=list(EMOTION_PRESETS.keys()),
                             value="Comedy / Playful (Taglish)",
-                            info="High-level emotional profile similar to ElevenLabs v3.",
                         )
                         expressiveness_slider = gr.Slider(
                             minimum=0,
                             maximum=200,
-                            value=100,
                             step=5,
                             label="Expressiveness (intensity)",
-                            info="0 = off, 100 = normal, 200 = max emotional shaping.",
                         )
             # Bottom: Generate + audio
@@ -528,7 +538,7 @@ async def create_demo():
                                 <div>
                                     <div id="eburon-audio-title">Latest generation</div>
                                     <div id="eburon-audio-subtitle">
-                                        Auto-plays after each emotional render. Make sure your browser allows audio.
                                     </div>
                                 </div>
                                 <div class="eburon-mini-pill">
@@ -549,8 +559,8 @@ async def create_demo():
                 inputs=[
                     text_input,
                     voice_dropdown,
-                    rate_slider,
-                    pitch_slider,
                     emotion_dropdown,
                     expressiveness_slider,
                 ],
@@ -563,7 +573,7 @@ async def create_demo():
 async def main():
     demo = await create_demo()
     demo.queue(default_concurrency_limit=50)
-    demo.launch()  # Compatible with gradio==4.36.1
 if __name__ == "__main__":

         "pitch_offset": 0,
     },
     "Comedy / Playful (Taglish)": {
+        "rate_offset": 14,   # faster
+        "pitch_offset": 6,   # brighter
     },
     "Storytelling / Warm": {
+        "rate_offset": -4,
+        "pitch_offset": 3,
     },
     "Emotional / Heartfelt": {
+        "rate_offset": -10,
+        "pitch_offset": 6,
     },
     "Angry / Rant": {
+        "rate_offset": 16,
+        "pitch_offset": 4,
     },
     "Sad / Dramatic": {
         "rate_offset": -18,
+        "pitch_offset": -4,
     },
 }
 # -----------------------------
+# Custom CSS (works with gradio==4.36.1)
 # -----------------------------
 EBURON_CSS = """
 body {
     color: #9ca3af;
 }
+/* Warning styling (Markdown used as banner) */
 .svelte-1g805jl {
     border-radius: 999px !important;
 }
 """
 # -----------------------------
+# Helper: map [pause], [laugh] etc to more natural text
 # -----------------------------
 def normalize_script_for_tts(text: str) -> str:
     """
     Convert expressive cues [pause], [laugh], [energetic intro] etc.
+    into punctuation and "ha ha ha" so Edge TTS doesn't read brackets
+    but still sounds more like a comedian.
     """
     def _repl(match: re.Match) -> str:
         cue = match.group(1).strip().lower()
+        if "pause" in cue or "beat" in cue or "silence" in cue:
+            return "... "
         if "laugh" in cue or "chuckle" in cue:
+            # let TTS actually say "ha ha ha" to mimic a laugh
+            return " ha ha ha, "
         if "intro" in cue or "outro" in cue:
             return " "
         if "soft" in cue or "whisper" in cue:
             return " "
+        if "energetic" in cue or "teasing" in cue or "conversational" in cue:
+            return " "
+        # default: just drop bracket cue
         return " "
+    # Replace bracketed cues
+    out = re.sub(r"\[(.*?)\]", _repl, text)
+    # Compress any crazy spacing
+    out = re.sub(r"\s+", " ", out)
+    return out.strip()
 # -----------------------------
     return voice_labels
+async def text_to_speech(text, voice, base_rate, base_pitch, emotion, expressiveness):
     if not text.strip():
         return None, "Please enter some text to synthesize."
     if not voice:
         return None, "Please select a voice."
+    # Clean expressive brackets into comedy-friendly text
     clean_text = normalize_script_for_tts(text)
     voice_short_name = voice.split(" - ")[0].strip()
     # Emotion → rate/pitch shaping
     # -----------------------------
     preset = EMOTION_PRESETS.get(emotion or "Neutral", EMOTION_PRESETS["Neutral"])
+    # 0 = no emotion, 100 = base, 200 = 2x preset strength
+    factor = max(0.0, float(expressiveness) / 100.0)
     rate_offset = int(preset["rate_offset"] * factor)
     pitch_offset = int(preset["pitch_offset"] * factor)
+    eff_rate = int(base_rate + rate_offset)
+    eff_pitch = int(base_pitch + pitch_offset)
+    # Clamp into safe ranges
     eff_rate = max(-50, min(50, eff_rate))
     eff_pitch = max(-20, min(20, eff_pitch))
     return tmp_path, None
+async def tts_interface(text, voice, base_rate, base_pitch, emotion, expressiveness):
     audio, warning = await text_to_speech(
         text=text,
         voice=voice,
+        base_rate=base_rate,
+        base_pitch=base_pitch,
         emotion=emotion,
         expressiveness=expressiveness,
     )
 # -----------------------------
+# Eburon Speech Studio v1.8 – ElevenLabs-ish expressive UI
 # -----------------------------
 async def create_demo():
     voices = await get_voices()
+    # Sample Taglish Alex Calleja–style comedy script
     sample_script = (
         "[energetic intro]\n"
+        "Magandang gabi sa inyong lahat! Ako nga pala si Alex… hindi Calleja, "
+        "pero pwede na rin sa murang kopya. [pause] Parang Shopee version ng Netflix special.\n\n"
         "[conversational]\n"
+        "Alam n’yo, mahirap na maging adult ngayon. Nung bata tayo, gusto natin tumanda para "
+        "“walang mag-uutos”. Ngayon, tumanda tayo… at ang pinaka-maingay mag-utos: BILLS. [pause]\n"
+        "Kuryente, tubig, WiFi, GCash utang, BNPL… parang ex na hindi makamove on. "
+        "Laging bumabalik buwan-buwan.\n\n"
         "[teasing tone]\n"
         "Tapos ‘yung kuryente, grabe. Kahit wala ka sa bahay, mataas pa rin bill. "
         "Parang Meralco, nag-a-assume: “Alam naming may iyak ka sa dilim, may load ‘yan sa emosyon.” [laugh]\n\n"
     )
     with gr.Blocks(title="Eburon Speech Studio v1.8") as demo:
+        # Inject CSS
         gr.HTML(f"<style>{EBURON_CSS}</style>", elem_id="eburon-style-inject")
         with gr.Column(elem_id="eburon-root"):
                 """
             )
+            # Main body
             with gr.Row():
                 # Left: Script
                 with gr.Column(scale=2, min_width=460):
                                 <div>
                                     <div class="eburon-section-title">Script</div>
                                     <div class="eburon-section-subtitle">
+                                        Taglish Alex Calleja–style skit with cues like [pause], [laugh], [energetic intro].
                                     </div>
                                 </div>
                                 <div class="eburon-mini-pill">
+                                    Cues auto-converted to pauses & “ha ha ha”
                                 </div>
                             </div>
                             """
                         text_input = gr.Textbox(
                             label="",
                             value=sample_script,
+                            lines=16,
                         )
+                # Right: Voice & emotion
                 with gr.Column(scale=1, min_width=340):
                     with gr.Group(elem_classes="eburon-main-card"):
                         gr.HTML(
                                 <div>
                                     <div class="eburon-section-title">Voice & Delivery</div>
                                     <div class="eburon-section-subtitle">
+                                        Emotion preset + intensity + fine speed & pitch (ElevenLabs-style).
                                     </div>
                                 </div>
                                 <div class="eburon-mini-pill">
                             choices=[""] + voices,
                             label="Voice",
                             value="",
+                            info="Tip: pick a lively EN voice (e.g. male) for stand-up style.",
                         )
+                        base_rate_slider = gr.Slider(
                             minimum=-50,
                             maximum=50,
+                            value=5,  # slightly faster by default
                             label="Base Speed",
                             step=1,
+                            info="Baseline speaking speed. Emotion will adjust on top.",
                         )
+                        base_pitch_slider = gr.Slider(
                             minimum=-20,
                             maximum=20,
+                            value=2,  # slightly brighter by default
                             label="Base Pitch",
                             step=1,
+                            info="Baseline pitch. Emotion will adjust on top.",
                         )
                         emotion_dropdown = gr.Dropdown(
                             label="Emotion preset",
                             choices=list(EMOTION_PRESETS.keys()),
                             value="Comedy / Playful (Taglish)",
+                            info="High-level emotional profile (approximate, using rate+pitch).",
                         )
                         expressiveness_slider = gr.Slider(
                             minimum=0,
                             maximum=200,
+                            value=130,  # a bit stronger than normal
                             step=5,
                             label="Expressiveness (intensity)",
+                            info="0 = off, 100 = normal, 200 = stronger emotion.",
                         )
             # Bottom: Generate + audio
                                 <div>
                                     <div id="eburon-audio-title">Latest generation</div>
                                     <div id="eburon-audio-subtitle">
+                                        Auto-plays after each render. Browser must allow audio playback.
                                     </div>
                                 </div>
                                 <div class="eburon-mini-pill">
                 inputs=[
                     text_input,
                     voice_dropdown,
+                    base_rate_slider,
+                    base_pitch_slider,
                     emotion_dropdown,
                     expressiveness_slider,
                 ],
 async def main():
     demo = await create_demo()
     demo.queue(default_concurrency_limit=50)
+    demo.launch()
 if __name__ == "__main__":