Spaces: Running on Zero
import os

import gradio as gr
import spaces
import torch
from transformers import pipeline

# Configuration for the SalamandraTAV multimodal translation pipeline.
# trust_remote_code is required: the model repo ships its own pipeline code.
_PIPELINE_KWARGS = {
    "task": "multimodal_mt",
    "model": "BSC-LT/salamandra-TAV-7b",
    "trust_remote_code": True,
    "token": os.environ.get("HF_TOKEN"),  # access token for the model repo
    "device_map": "auto",
    "torch_dtype": torch.float16,
}

# Loaded once at import time so every request reuses the same model instance.
pipe = pipeline(**_PIPELINE_KWARGS)
# Language code -> human-readable display name, in dropdown display order.
LANGUAGES = dict(
    [
        ("autodetect", "Autodetect"),
        ("en", "English"),
        ("es", "Spanish"),
        ("ca", "Catalan"),
        ("pt", "Portuguese"),
        ("gl", "Galician"),
        ("eu", "Basque"),
    ]
)

# Reverse mapping (display name -> code) for looking up dropdown selections.
LANG_TO_NAME = {}
for _code, _name in LANGUAGES.items():
    LANG_TO_NAME[_name] = _code
def process_audio(audio, source_lang_name, target_lang_name):
    """Transcribe or translate an audio file with the SalamandraTAV pipeline.

    Args:
        audio: Filesystem path to the input audio (Gradio ``type="filepath"``),
            or ``None`` when nothing was provided.
        source_lang_name: Display name of the source language (e.g. "English"
            or "Autodetect"); may be ``None`` if the dropdown was cleared.
        target_lang_name: Display name of the target language; required.

    Returns:
        A ``(text, detected_language)`` tuple: the transcription/translation
        and, when the source was autodetected, the identified language
        (empty string otherwise and on validation errors).
    """
    # NOTE(review): the Space header says "Running on Zero", but this function
    # is not decorated with @spaces.GPU — confirm whether ZeroGPU allocation
    # is handled elsewhere; without it, ZeroGPU Spaces typically fail.
    if audio is None:
        return "Please provide an audio file or record one.", ""
    if target_lang_name is None:
        return "Please select a target language.", ""

    # Map display names back to language codes ("English" -> "en").
    # Fix: an unknown/cleared source selection previously produced
    # source_lang=None, which is != "autodetect" and so passed src_lang=None
    # to the pipeline; it now falls back to autodetection instead.
    source_lang = LANG_TO_NAME.get(source_lang_name, "autodetect")
    target_lang = LANG_TO_NAME.get(target_lang_name)
    autodetect = source_lang == "autodetect"

    generation_kwargs = {"beam_size": 5, "max_new_tokens": 100}

    # Step 1: ASR. Keep the chat history so follow-up t2tt/lid calls reuse it.
    # NOTE(review): display names ("English"), not ISO codes, are passed as
    # src_lang/tgt_lang — this preserves the original behavior; confirm it
    # matches the model's expected language format.
    asr_kwargs = {"mode": "asr", "return_chat_history": True, **generation_kwargs}
    if not autodetect:
        asr_kwargs["src_lang"] = source_lang_name
    history = pipe(audio, **asr_kwargs)

    if source_lang == target_lang:
        # Same source and target: plain transcription, no translation step.
        text = history.get_assistant_messages()[-1]
    else:
        # Step 2: text-to-text translation of the transcript.
        t2tt_kwargs = {
            "mode": "t2tt",
            "tgt_lang": target_lang_name,
            "return_chat_history": True,
            **generation_kwargs,
        }
        if not autodetect:
            t2tt_kwargs["src_lang"] = source_lang_name
        history = pipe(history, **t2tt_kwargs)
        text = history.get_assistant_messages()[-1]

    detected_language = ""
    if autodetect:
        # Step 3: language identification on the accumulated history.
        lang_history = pipe(history, mode="lid", return_chat_history=True, **generation_kwargs)
        detected_language = lang_history.get_assistant_messages()[-1]

    return text, detected_language
# ---- Gradio interface ----
with gr.Blocks() as demo:
    gr.Markdown("# SalamandraTAV: Speech-to-Text Translation Demo")
    gr.Markdown(
        "A multilingual model for Speech-to-Text Translation (S2TT) and Automatic Speech Recognition (ASR) for Iberian languages."
    )

    with gr.Row():
        # Left column: inputs and the submit button.
        with gr.Column():
            audio_in = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
            with gr.Row():
                src_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.values()),
                    value=LANGUAGES["autodetect"],
                    label="Source Language (Optional)",
                )
                tgt_dropdown = gr.Dropdown(
                    choices=[name for code, name in LANGUAGES.items() if code != "autodetect"],
                    label="Target Language (Required)",
                )
            run_btn = gr.Button("Translate/Transcribe")
        # Right column: model output and the autodetected language.
        with gr.Column():
            result_box = gr.Textbox(label="Output", lines=10, interactive=False)
            detected_box = gr.Textbox(label="Detected Source Language", interactive=False)

    run_btn.click(
        fn=process_audio,
        inputs=[audio_in, src_dropdown, tgt_dropdown],
        outputs=[result_box, detected_box],
    )

    gr.Markdown("## Examples")
    # Both examples use the same public sample clip; one translates, one
    # transcribes (source == target).
    _SAMPLE_URL = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/orig/127389__acclivity__thetimehascome.wav"
    gr.Examples(
        examples=[
            [_SAMPLE_URL, LANGUAGES["en"], LANGUAGES["es"]],
            [_SAMPLE_URL, LANGUAGES["en"], LANGUAGES["en"]],
        ],
        inputs=[audio_in, src_dropdown, tgt_dropdown],
        outputs=[result_box, detected_box],
        fn=process_audio,
    )

if __name__ == "__main__":
    demo.launch()