From ca43cd9d08987dd8e0589aeed392828deb41fc38 Mon Sep 17 00:00:00 2001 From: Quentin Fahrner Date: Mon, 1 Sep 2025 16:23:21 +0200 Subject: [PATCH 1/2] #340: incomplete transcription of non-English audio When transcribing a French audio file, the WhisperX engine with the large-v3 model returns only a partial transcription: the beginning and the end are missing. With the OpenAI Whisper engine, the problem does not occur. This patch passes explicit VAD options (vad_onset and vad_offset of 0.1) to whisperx.load_model. See https://github.com/m-bain/whisperX/issues/764 --- app/asr_models/mbain_whisperx_engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/app/asr_models/mbain_whisperx_engine.py b/app/asr_models/mbain_whisperx_engine.py index c39b9df..4a1634b 100644 --- a/app/asr_models/mbain_whisperx_engine.py +++ b/app/asr_models/mbain_whisperx_engine.py @@ -1,4 +1,4 @@ -import time +qqimport time from io import StringIO from threading import Thread from typing import BinaryIO, Union @@ -23,11 +23,13 @@ def __init__(self): def load_model(self): asr_options = {"without_timestamps": False} + vad_options = {"vad_onset": 0.1, "vad_offset": 0.1} self.model['whisperx'] = whisperx.load_model( CONFIG.MODEL_NAME, device=CONFIG.DEVICE, compute_type=CONFIG.MODEL_QUANTIZATION, - asr_options=asr_options + asr_options=asr_options, + vad_options=vad_options ) if CONFIG.HF_TOKEN != "": From d3b3bc64da2d457a338c6d29bb984588b4d2cf5e Mon Sep 17 00:00:00 2001 From: Quentin Fahrner Date: Mon, 1 Sep 2025 16:24:24 +0200 Subject: [PATCH 2/2] #340: incomplete transcription of non-English audio --- app/asr_models/mbain_whisperx_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/asr_models/mbain_whisperx_engine.py b/app/asr_models/mbain_whisperx_engine.py index 4a1634b..b87faab 100644 --- a/app/asr_models/mbain_whisperx_engine.py +++ b/app/asr_models/mbain_whisperx_engine.py @@ -1,4 +1,4 @@ -qqimport time +import time from io import StringIO from threading import Thread from typing import BinaryIO, Union