Add Buffering to Avoid Speech Gaps due to Orca Slowdown
- app.py +9 -5
- requirements.txt +1 -1
app.py CHANGED

@@ -69,7 +69,7 @@ def response(state: AppState, audio: tuple):
     if not audio:
         return AppState()

-    file_name = f"
+    file_name = f"./{xxhash.xxh32(bytes(audio[1])).hexdigest()}.wav"

     sf.write(file_name, audio[1], audio[0], format="wav")

@@ -103,7 +103,8 @@ def response(state: AppState, audio: tuple):
     state.model_outs = None
     prev_outs = causal_outs
     stream = orca.stream_open()
-
+    i = 0
+    buff = []
     for resp, outs in diva_audio(
         (audio[0], audio[1]),
         prev_outs=(prev_outs if prev_outs is not None else None),
@@ -112,15 +113,18 @@ def response(state: AppState, audio: tuple):
         if prev_resp == LOADER_STR:
             prev_resp = ""
         state.conversation[-1]["content"] = resp
-        pcm = stream.synthesize(resp[len(prev_resp) :])
         audio_chunk = None
+        pcm = stream.synthesize(resp[len(prev_resp) :])
         if pcm is not None:
+            buff.extend(pcm)
+            if len(buff) > (orca.sample_rate*2):
                 mp3_io = io.BytesIO()
                 sf.write(
-                    mp3_io, np.asarray(
+                    mp3_io, np.asarray(buff[:orca.sample_rate]).astype(np.int16), orca.sample_rate, format="mp3"
                 )
                 audio_chunk = mp3_io.getvalue()
                 mp3_io.close()
+                buff = buff[orca.sample_rate:]
         yield state, state.conversation, audio_chunk

         del outs.logits
@@ -256,4 +260,4 @@ with gr.Blocks(theme=theme, js=js) as demo:
     )

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
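The gist of the app.py change is an accumulate-and-flush pattern: each PCM chunk streamed back from Orca is appended to a buffer, and an MP3 chunk is encoded and yielded only once the buffer holds more than two seconds of audio (orca.sample_rate * 2 samples), with one second flushed at a time. Below is a minimal sketch of that pattern in isolation, not the app's actual code: the pcm_stream iterable, the min_buffer_seconds / chunk_seconds parameters, the while loop, and the final tail flush are illustrative assumptions that generalize the single-chunk if in the diff above.

```python
import io

import numpy as np
import soundfile as sf


def buffered_mp3_chunks(pcm_stream, sample_rate, min_buffer_seconds=2, chunk_seconds=1):
    """Accumulate streamed PCM samples and yield fixed-size MP3 chunks.

    Chunks are emitted only once the buffer holds more than
    `min_buffer_seconds` of audio, so a brief synthesis slowdown drains
    the buffer instead of stalling playback.
    """
    buff = []
    for pcm in pcm_stream:  # pcm is a list of int16 samples, or None if nothing is ready yet
        if pcm is not None:
            buff.extend(pcm)
        # Flush one `chunk_seconds` slice at a time while enough audio is buffered.
        while len(buff) > sample_rate * min_buffer_seconds:
            chunk = np.asarray(buff[: sample_rate * chunk_seconds]).astype(np.int16)
            mp3_io = io.BytesIO()
            sf.write(mp3_io, chunk, sample_rate, format="mp3")
            yield mp3_io.getvalue()
            mp3_io.close()
            buff = buff[sample_rate * chunk_seconds :]
    # Encode whatever remains once the stream ends (the diff above does not show this step).
    if buff:
        mp3_io = io.BytesIO()
        sf.write(mp3_io, np.asarray(buff).astype(np.int16), sample_rate, format="mp3")
        yield mp3_io.getvalue()
        mp3_io.close()
```

Holding back a couple of seconds of audio adds a little playback latency, but it means a momentary Orca slowdown drains the buffer rather than producing an audible gap in the synthesized speech.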
requirements.txt CHANGED

@@ -1,5 +1,5 @@
 transformers==4.43.3
-gradio==5.0
+gradio==5.1.0
 spaces
 accelerate
