"""Speech-recognition strategy backed by the faster-whisper implementation."""
import sys
from typing import Any

import whisper
from faster_whisper import WhisperModel

import config
from recognizer.strategies import BaseRecognizerStrategy


class FastWhisperStrategy(BaseRecognizerStrategy):
    """Recognizer strategy that transcribes audio with a faster-whisper model.

    The model is loaded once at construction time from the settings in
    ``config`` and reused for every ``recognize`` call.
    """

    def __init__(self) -> None:
        # Model size/path, device and threading knobs all come from config.
        self._model = WhisperModel(
            model_size_or_path=config.HARPYIA_MODEL,
            device=config.DEVICE,
            num_workers=config.WHISPER_NUM_WORKERS,
            cpu_threads=config.WHISPER_CPU_THREADS,
        )

    def recognize(self, file, language, prompt) -> Any:
        """Transcribe an audio file.

        :param file: file-like object with a ``name`` attribute pointing at
            an audio file on disk (e.g. an uploaded temp file).
        :param language: language code passed to the model, or ``None``.
        :param prompt: initial prompt used to bias decoding, or ``None``.
        :return: dict with the joined transcription under ``'text'`` and a
            placeholder ``'segments'`` mapping (kept for API compatibility).
        """
        audio = self._prepare_file(file.name)
        return self._transcribe(audio, language, prompt)

    def _prepare_file(self, filename: str):
        """Load the audio at *filename* and pad/trim it to the model window."""
        audio = whisper.load_audio(filename, sr=config.HARPYIA_SAMPLE_RATE)
        audio = whisper.pad_or_trim(audio)
        return audio

    def _transcribe(self, audio, language, prompt):
        """Run the model on *audio* and assemble the result dict."""
        segments, _ = self._model.transcribe(
            audio,
            language=language,
            initial_prompt=prompt,
            condition_on_previous_text=False,
            vad_filter=True,
            beam_size=config.WHISPER_BEAM_SIZE,
        )

        # faster-whisper returns a one-shot generator; materialize it once.
        # (Previously the debug print loop exhausted the generator, so the
        # later list(segments) was always empty and 'text' was always ''.)
        segments = list(segments)

        print('Segments:', file=sys.stderr)
        for segment in segments:
            print(segment, file=sys.stderr)

        words = [segment.text for segment in segments]
        return {
            'text': ' '.join(words),
            'segments': {
                'id': None,
                'seek': None,
                'start': None,
                'end': None,
                'text': None,
                'tokens': None,
                'temperature': None,
                'avg_logprob': None,
                'compression_ratio': None,
                'no_speech_prob': None,
            }
        }