aitekphsoftware committed on
Commit
230256d
·
verified ·
1 Parent(s): 46c1003

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -56
app.py CHANGED
@@ -15,29 +15,29 @@ EMOTION_PRESETS = {
15
  "pitch_offset": 0,
16
  },
17
  "Comedy / Playful (Taglish)": {
18
- "rate_offset": 12, # faster
19
- "pitch_offset": 4, # slightly brighter
20
  },
21
  "Storytelling / Warm": {
22
- "rate_offset": -6, # a bit slower
23
- "pitch_offset": 2, # slightly warmer
24
  },
25
  "Emotional / Heartfelt": {
26
- "rate_offset": -10, # slower
27
- "pitch_offset": 5, # a bit more open / emotional
28
  },
29
  "Angry / Rant": {
30
- "rate_offset": 15,
31
- "pitch_offset": 3,
32
  },
33
  "Sad / Dramatic": {
34
  "rate_offset": -18,
35
- "pitch_offset": -5,
36
  },
37
  }
38
 
39
  # -----------------------------
40
- # Custom CSS injected via <style> for Gradio 4.36.1+
41
  # -----------------------------
42
  EBURON_CSS = """
43
  body {
@@ -245,7 +245,7 @@ label span, .gr-textbox label, .gr-slider label, .gr-dropdown label {
245
  color: #9ca3af;
246
  }
247
 
248
- /* Warning styling (Gradio Alert) */
249
  .svelte-1g805jl {
250
  border-radius: 999px !important;
251
  }
@@ -261,28 +261,37 @@ label span, .gr-textbox label, .gr-slider label, .gr-dropdown label {
261
  """
262
 
263
  # -----------------------------
264
- # Helper: normalize expressive cues like [pause], [laugh]
265
  # -----------------------------
266
  def normalize_script_for_tts(text: str) -> str:
267
  """
268
  Convert expressive cues [pause], [laugh], [energetic intro] etc.
269
- into punctuation so Edge TTS won't literally read the brackets.
 
270
  """
271
 
272
  def _repl(match: re.Match) -> str:
273
  cue = match.group(1).strip().lower()
274
- if "pause" in cue or "beat" in cue:
275
- return " "
 
276
  if "laugh" in cue or "chuckle" in cue:
277
- return " "
 
278
  if "intro" in cue or "outro" in cue:
279
  return " "
280
  if "soft" in cue or "whisper" in cue:
281
  return " "
282
- # Default: remove bracket cue
 
 
283
  return " "
284
 
285
- return re.sub(r"\[(.*?)\]", _repl, text)
 
 
 
 
286
 
287
 
288
  # -----------------------------
@@ -298,14 +307,14 @@ async def get_voices():
298
  return voice_labels
299
 
300
 
301
- async def text_to_speech(text, voice, rate, pitch, emotion, expressiveness):
302
  if not text.strip():
303
  return None, "Please enter some text to synthesize."
304
 
305
  if not voice:
306
  return None, "Please select a voice."
307
 
308
- # Clean expressive brackets for TTS
309
  clean_text = normalize_script_for_tts(text)
310
 
311
  voice_short_name = voice.split(" - ")[0].strip()
@@ -314,15 +323,16 @@ async def text_to_speech(text, voice, rate, pitch, emotion, expressiveness):
314
  # Emotion → rate/pitch shaping
315
  # -----------------------------
316
  preset = EMOTION_PRESETS.get(emotion or "Neutral", EMOTION_PRESETS["Neutral"])
317
- factor = max(0.0, float(expressiveness) / 100.0) # 0 to 2.0
 
318
 
319
  rate_offset = int(preset["rate_offset"] * factor)
320
  pitch_offset = int(preset["pitch_offset"] * factor)
321
 
322
- eff_rate = int(rate + rate_offset)
323
- eff_pitch = int(pitch + pitch_offset)
324
 
325
- # Clamp into slider ranges
326
  eff_rate = max(-50, min(50, eff_rate))
327
  eff_pitch = max(-20, min(20, eff_pitch))
328
 
@@ -343,12 +353,12 @@ async def text_to_speech(text, voice, rate, pitch, emotion, expressiveness):
343
  return tmp_path, None
344
 
345
 
346
- async def tts_interface(text, voice, rate, pitch, emotion, expressiveness):
347
  audio, warning = await text_to_speech(
348
  text=text,
349
  voice=voice,
350
- rate=rate,
351
- pitch=pitch,
352
  emotion=emotion,
353
  expressiveness=expressiveness,
354
  )
@@ -358,20 +368,21 @@ async def tts_interface(text, voice, rate, pitch, emotion, expressiveness):
358
 
359
 
360
  # -----------------------------
361
- # Eburon Speech Studio v1.8 – ElevenLabs-style UI
362
  # -----------------------------
363
  async def create_demo():
364
  voices = await get_voices()
365
 
366
- # Sample Taglish Alex Calleja–style comedy script as default
367
  sample_script = (
368
  "[energetic intro]\n"
369
- "Magandang gabi sa inyong lahat! Ako nga pala si Alex… hindi Calleja, pero pwede na rin sa murang kopya. "
370
- "[pause] Parang Shopee version ng Netflix special.\n\n"
371
  "[conversational]\n"
372
- "Alam n’yo, mahirap na maging adult ngayon. Nung bata tayo, gusto natin tumanda para “walang mag-uutos”. "
373
- "Ngayon, tumanda tayo… at ang pinaka-maingay mag-utos: BILLS. [pause]\n"
374
- "Kuryente, tubig, WiFi, GCash utang, BNPL… parang ex na hindi makamove on. Laging bumabalik buwan-buwan.\n\n"
 
375
  "[teasing tone]\n"
376
  "Tapos ‘yung kuryente, grabe. Kahit wala ka sa bahay, mataas pa rin bill. "
377
  "Parang Meralco, nag-a-assume: “Alam naming may iyak ka sa dilim, may load ‘yan sa emosyon.” [laugh]\n\n"
@@ -391,7 +402,7 @@ async def create_demo():
391
  )
392
 
393
  with gr.Blocks(title="Eburon Speech Studio v1.8") as demo:
394
- # Inject CSS (works in Gradio 4.36.1 and 6+)
395
  gr.HTML(f"<style>{EBURON_CSS}</style>", elem_id="eburon-style-inject")
396
 
397
  with gr.Column(elem_id="eburon-root"):
@@ -423,7 +434,7 @@ async def create_demo():
423
  """
424
  )
425
 
426
- # Main: script + voice/emotion
427
  with gr.Row():
428
  # Left: Script
429
  with gr.Column(scale=2, min_width=460):
@@ -434,11 +445,11 @@ async def create_demo():
434
  <div>
435
  <div class="eburon-section-title">Script</div>
436
  <div class="eburon-section-subtitle">
437
- Taglish Alex Calleja–style skit with expressive cues like [pause], [laugh], [energetic intro].
438
  </div>
439
  </div>
440
  <div class="eburon-mini-pill">
441
- Bracket cues are auto-cleaned before TTS
442
  </div>
443
  </div>
444
  """
@@ -446,11 +457,10 @@ async def create_demo():
446
  text_input = gr.Textbox(
447
  label="",
448
  value=sample_script,
449
- placeholder="Write your expressive Taglish skit here...",
450
- lines=14,
451
  )
452
 
453
- # Right: Voice & expressive controls
454
  with gr.Column(scale=1, min_width=340):
455
  with gr.Group(elem_classes="eburon-main-card"):
456
  gr.HTML(
@@ -459,7 +469,7 @@ async def create_demo():
459
  <div>
460
  <div class="eburon-section-title">Voice & Delivery</div>
461
  <div class="eburon-section-subtitle">
462
- ElevenLabs v3 style: emotion preset + intensity + fine speed & pitch.
463
  </div>
464
  </div>
465
  <div class="eburon-mini-pill">
@@ -473,41 +483,41 @@ async def create_demo():
473
  choices=[""] + voices,
474
  label="Voice",
475
  value="",
476
- info="Pick a neural voice from the Edge TTS catalog.",
477
  )
478
 
479
- rate_slider = gr.Slider(
480
  minimum=-50,
481
  maximum=50,
482
- value=0,
483
  label="Base Speed",
484
  step=1,
485
- info="Manual speed baseline. Emotion will adjust on top of this.",
486
  )
487
 
488
- pitch_slider = gr.Slider(
489
  minimum=-20,
490
  maximum=20,
491
- value=0,
492
  label="Base Pitch",
493
  step=1,
494
- info="Manual pitch baseline. Emotion will adjust on top of this.",
495
  )
496
 
497
  emotion_dropdown = gr.Dropdown(
498
  label="Emotion preset",
499
  choices=list(EMOTION_PRESETS.keys()),
500
  value="Comedy / Playful (Taglish)",
501
- info="High-level emotional profile similar to ElevenLabs v3.",
502
  )
503
 
504
  expressiveness_slider = gr.Slider(
505
  minimum=0,
506
  maximum=200,
507
- value=100,
508
  step=5,
509
  label="Expressiveness (intensity)",
510
- info="0 = off, 100 = normal, 200 = max emotional shaping.",
511
  )
512
 
513
  # Bottom: Generate + audio
@@ -528,7 +538,7 @@ async def create_demo():
528
  <div>
529
  <div id="eburon-audio-title">Latest generation</div>
530
  <div id="eburon-audio-subtitle">
531
- Auto-plays after each emotional render. Make sure your browser allows audio.
532
  </div>
533
  </div>
534
  <div class="eburon-mini-pill">
@@ -549,8 +559,8 @@ async def create_demo():
549
  inputs=[
550
  text_input,
551
  voice_dropdown,
552
- rate_slider,
553
- pitch_slider,
554
  emotion_dropdown,
555
  expressiveness_slider,
556
  ],
@@ -563,7 +573,7 @@ async def create_demo():
563
  async def main():
564
  demo = await create_demo()
565
  demo.queue(default_concurrency_limit=50)
566
- demo.launch() # Compatible with gradio==4.36.1
567
 
568
 
569
  if __name__ == "__main__":
 
15
  "pitch_offset": 0,
16
  },
17
  "Comedy / Playful (Taglish)": {
18
+ "rate_offset": 14, # faster
19
+ "pitch_offset": 6, # brighter
20
  },
21
  "Storytelling / Warm": {
22
+ "rate_offset": -4,
23
+ "pitch_offset": 3,
24
  },
25
  "Emotional / Heartfelt": {
26
+ "rate_offset": -10,
27
+ "pitch_offset": 6,
28
  },
29
  "Angry / Rant": {
30
+ "rate_offset": 16,
31
+ "pitch_offset": 4,
32
  },
33
  "Sad / Dramatic": {
34
  "rate_offset": -18,
35
+ "pitch_offset": -4,
36
  },
37
  }
38
 
39
  # -----------------------------
40
+ # Custom CSS (works with gradio==4.36.1)
41
  # -----------------------------
42
  EBURON_CSS = """
43
  body {
 
245
  color: #9ca3af;
246
  }
247
 
248
+ /* Warning styling (Markdown used as banner) */
249
  .svelte-1g805jl {
250
  border-radius: 999px !important;
251
  }
 
261
  """
262
 
263
  # -----------------------------
264
+ # Helper: map [pause], [laugh] etc to more natural text
265
  # -----------------------------
266
  def normalize_script_for_tts(text: str) -> str:
267
  """
268
  Convert expressive cues [pause], [laugh], [energetic intro] etc.
269
+ into punctuation and "ha ha ha" so Edge TTS doesn't read brackets
270
+ but still sounds more like a comedian.
271
  """
272
 
273
  def _repl(match: re.Match) -> str:
274
  cue = match.group(1).strip().lower()
275
+
276
+ if "pause" in cue or "beat" in cue or "silence" in cue:
277
+ return "... "
278
  if "laugh" in cue or "chuckle" in cue:
279
+ # let TTS actually say "ha ha ha" to mimic a laugh
280
+ return " ha ha ha, "
281
  if "intro" in cue or "outro" in cue:
282
  return " "
283
  if "soft" in cue or "whisper" in cue:
284
  return " "
285
+ if "energetic" in cue or "teasing" in cue or "conversational" in cue:
286
+ return " "
287
+ # default: just drop bracket cue
288
  return " "
289
 
290
+ # Replace bracketed cues
291
+ out = re.sub(r"\[(.*?)\]", _repl, text)
292
+ # Compress any crazy spacing
293
+ out = re.sub(r"\s+", " ", out)
294
+ return out.strip()
295
 
296
 
297
  # -----------------------------
 
307
  return voice_labels
308
 
309
 
310
+ async def text_to_speech(text, voice, base_rate, base_pitch, emotion, expressiveness):
311
  if not text.strip():
312
  return None, "Please enter some text to synthesize."
313
 
314
  if not voice:
315
  return None, "Please select a voice."
316
 
317
+ # Clean expressive brackets into comedy-friendly text
318
  clean_text = normalize_script_for_tts(text)
319
 
320
  voice_short_name = voice.split(" - ")[0].strip()
 
323
  # Emotion → rate/pitch shaping
324
  # -----------------------------
325
  preset = EMOTION_PRESETS.get(emotion or "Neutral", EMOTION_PRESETS["Neutral"])
326
+ # 0 = no emotion, 100 = base, 200 = 2x preset strength
327
+ factor = max(0.0, float(expressiveness) / 100.0)
328
 
329
  rate_offset = int(preset["rate_offset"] * factor)
330
  pitch_offset = int(preset["pitch_offset"] * factor)
331
 
332
+ eff_rate = int(base_rate + rate_offset)
333
+ eff_pitch = int(base_pitch + pitch_offset)
334
 
335
+ # Clamp into safe ranges
336
  eff_rate = max(-50, min(50, eff_rate))
337
  eff_pitch = max(-20, min(20, eff_pitch))
338
 
 
353
  return tmp_path, None
354
 
355
 
356
+ async def tts_interface(text, voice, base_rate, base_pitch, emotion, expressiveness):
357
  audio, warning = await text_to_speech(
358
  text=text,
359
  voice=voice,
360
+ base_rate=base_rate,
361
+ base_pitch=base_pitch,
362
  emotion=emotion,
363
  expressiveness=expressiveness,
364
  )
 
368
 
369
 
370
  # -----------------------------
371
+ # Eburon Speech Studio v1.8 – ElevenLabs-ish expressive UI
372
  # -----------------------------
373
  async def create_demo():
374
  voices = await get_voices()
375
 
376
+ # Sample Taglish Alex Calleja–style comedy script
377
  sample_script = (
378
  "[energetic intro]\n"
379
+ "Magandang gabi sa inyong lahat! Ako nga pala si Alex… hindi Calleja, "
380
+ "pero pwede na rin sa murang kopya. [pause] Parang Shopee version ng Netflix special.\n\n"
381
  "[conversational]\n"
382
+ "Alam n’yo, mahirap na maging adult ngayon. Nung bata tayo, gusto natin tumanda para "
383
+ "“walang mag-uutos”. Ngayon, tumanda tayo… at ang pinaka-maingay mag-utos: BILLS. [pause]\n"
384
+ "Kuryente, tubig, WiFi, GCash utang, BNPL… parang ex na hindi makamove on. "
385
+ "Laging bumabalik buwan-buwan.\n\n"
386
  "[teasing tone]\n"
387
  "Tapos ‘yung kuryente, grabe. Kahit wala ka sa bahay, mataas pa rin bill. "
388
  "Parang Meralco, nag-a-assume: “Alam naming may iyak ka sa dilim, may load ‘yan sa emosyon.” [laugh]\n\n"
 
402
  )
403
 
404
  with gr.Blocks(title="Eburon Speech Studio v1.8") as demo:
405
+ # Inject CSS
406
  gr.HTML(f"<style>{EBURON_CSS}</style>", elem_id="eburon-style-inject")
407
 
408
  with gr.Column(elem_id="eburon-root"):
 
434
  """
435
  )
436
 
437
+ # Main body
438
  with gr.Row():
439
  # Left: Script
440
  with gr.Column(scale=2, min_width=460):
 
445
  <div>
446
  <div class="eburon-section-title">Script</div>
447
  <div class="eburon-section-subtitle">
448
+ Taglish Alex Calleja–style skit with cues like [pause], [laugh], [energetic intro].
449
  </div>
450
  </div>
451
  <div class="eburon-mini-pill">
452
+ Cues auto-converted to pauses & “ha ha ha”
453
  </div>
454
  </div>
455
  """
 
457
  text_input = gr.Textbox(
458
  label="",
459
  value=sample_script,
460
+ lines=16,
 
461
  )
462
 
463
+ # Right: Voice & emotion
464
  with gr.Column(scale=1, min_width=340):
465
  with gr.Group(elem_classes="eburon-main-card"):
466
  gr.HTML(
 
469
  <div>
470
  <div class="eburon-section-title">Voice & Delivery</div>
471
  <div class="eburon-section-subtitle">
472
+ Emotion preset + intensity + fine speed & pitch (ElevenLabs-style).
473
  </div>
474
  </div>
475
  <div class="eburon-mini-pill">
 
483
  choices=[""] + voices,
484
  label="Voice",
485
  value="",
486
+ info="Tip: pick a lively EN voice (e.g. male) for stand-up style.",
487
  )
488
 
489
+ base_rate_slider = gr.Slider(
490
  minimum=-50,
491
  maximum=50,
492
+ value=5, # slightly faster by default
493
  label="Base Speed",
494
  step=1,
495
+ info="Baseline speaking speed. Emotion will adjust on top.",
496
  )
497
 
498
+ base_pitch_slider = gr.Slider(
499
  minimum=-20,
500
  maximum=20,
501
+ value=2, # slightly brighter by default
502
  label="Base Pitch",
503
  step=1,
504
+ info="Baseline pitch. Emotion will adjust on top.",
505
  )
506
 
507
  emotion_dropdown = gr.Dropdown(
508
  label="Emotion preset",
509
  choices=list(EMOTION_PRESETS.keys()),
510
  value="Comedy / Playful (Taglish)",
511
+ info="High-level emotional profile (approximate, using rate+pitch).",
512
  )
513
 
514
  expressiveness_slider = gr.Slider(
515
  minimum=0,
516
  maximum=200,
517
+ value=130, # a bit stronger than normal
518
  step=5,
519
  label="Expressiveness (intensity)",
520
+ info="0 = off, 100 = normal, 200 = stronger emotion.",
521
  )
522
 
523
  # Bottom: Generate + audio
 
538
  <div>
539
  <div id="eburon-audio-title">Latest generation</div>
540
  <div id="eburon-audio-subtitle">
541
+ Auto-plays after each render. Browser must allow audio playback.
542
  </div>
543
  </div>
544
  <div class="eburon-mini-pill">
 
559
  inputs=[
560
  text_input,
561
  voice_dropdown,
562
+ base_rate_slider,
563
+ base_pitch_slider,
564
  emotion_dropdown,
565
  expressiveness_slider,
566
  ],
 
573
  async def main():
574
  demo = await create_demo()
575
  demo.queue(default_concurrency_limit=50)
576
+ demo.launch()
577
 
578
 
579
  if __name__ == "__main__":