diff --git a/Dockerfile b/Dockerfile
index 3b3403f..b98f321 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,20 +1,32 @@
 FROM python:3.10-slim
 
+ENV FLASK_APP=src/app.py
+
 WORKDIR /app
 
-COPY requirements.txt /app
-RUN apt-get update && apt-get install git -y
-RUN pip3 install -r requirements.txt
-RUN pip3 install "git+https://github.com/openai/whisper.git"
-RUN apt-get install -y ffmpeg
+# git is required by the pip VCS install below. The apt lists are removed in
+# the same layer so that they do not end up in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg git && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip3 install --no-cache-dir "git+https://github.com/openai/whisper.git"
 
-RUN whisper --model medium --language ru dummy.wav; exit 0
-RUN whisper --model small --language ru dummy.wav; exit 0
+# Pre-download both models. No dummy.wav has been copied at this point, so each
+# call may fail; "|| true" tolerates that without ending the shell, whereas the
+# former "; exit 0 &&" exited before the second model was ever requested.
+RUN (whisper --model medium --language ru dummy.wav || true) && \
+    (whisper --model small --language ru dummy.wav || true)
 
-COPY . .
+# Separate requirements installation to keep other dependencies
+# in cache
+ARG PIP_REQ_FILE=requirements.txt
+COPY ${PIP_REQ_FILE} ${PIP_REQ_FILE}
+RUN pip3 install --no-cache-dir -r ${PIP_REQ_FILE}
+
+# Application code is copied last: it changes most often and must not
+# invalidate the dependency layers above.
+COPY src/ src/
 
 EXPOSE 5000
 
-ENV FLASK_APP=src/app.py
-
-CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
+CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..622219d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is assigned twice in the recipe below.
+# In a shell command prefix the last assignment wins, so expandable_segments:True
+# is presumably not in effect - confirm which value is intended.
+run:
+	PYTORCH_NO_CUDA_MEMORY_CACHING=1 \
+	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+	CUDA_LAUNCH_BLOCKING=1 \
+	FLASK_APP=src/app.py \
+	PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 \
+	flask run --host=0.0.0.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index a3e0ffb..46071bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,62 @@
-flask==3.0.2
-Jinja2==3.1.3
+asgiref==3.7.2
+av==11.0.0
 blinker==1.7.0
-Werkzeug==3.0.1
+certifi==2024.2.2
+charset-normalizer==3.3.2
 click==8.1.7
+coloredlogs==15.0.1
+ctranslate2==4.0.0
+Cython==3.0.8
+dtw-python==1.3.1
+faster-whisper==1.0.0
+filelock==3.13.1
+Flask==3.0.2
+flatbuffers==23.5.26
+fsspec==2024.2.0
+huggingface-hub==0.21.3
+humanfriendly==10.0 +idna==3.6 itsdangerous==2.1.2 +Jinja2==3.1.3 +llvmlite==0.42.0 MarkupSafe==2.1.5 +more-itertools==10.2.0 +mpmath==1.3.0 +networkx==3.2.1 +numba==0.59.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.19.3 +nvidia-nvjitlink-cu12==12.3.101 +nvidia-nvtx-cu12==12.1.105 +onnxruntime==1.17.1 +openai-whisper @ git+https://github.com/openai/whisper.git@ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab +packaging==23.2 +pillow==10.2.0 +protobuf==4.25.3 python-dotenv==1.0.1 +PyYAML==6.0.1 +regex==2023.12.25 +requests==2.31.0 +scipy==1.12.0 +six==1.16.0 +sympy==1.12 +tiktoken==0.6.0 +tokenizers==0.15.2 +torch==2.2.1 +torchaudio==2.2.1 +torchvision==0.17.1 +tqdm==4.66.2 +triton==2.2.0 +typing_extensions==4.10.0 +urllib3==2.2.1 +Werkzeug==3.0.1 +whisper-timestamped==1.15.0 \ No newline at end of file diff --git a/src/app.py b/src/app.py index 25ac26a..fc612fe 100644 --- a/src/app.py +++ b/src/app.py @@ -1,67 +1,59 @@ from flask import Flask, abort, request from tempfile import NamedTemporaryFile -from dotenv import load_dotenv -import os -import whisper -import torch + import sys -import re -load_dotenv() +import config -HARPYIA_PROMPT = os.getenv('HARPYIA_PROMPT') or 'спасите помогите на помощь пожар' -HARPYIA_MODEL = os.getenv('HARPYIA_MODEL') or 'medium' -HARPYIA_LANGUAGE = os.getenv('HARPYIA_LANGUAGE') or 'ru' +from queue_stack import QueueStack +from queue_stack.strategies import RecognizeAndSendStrategy -# Check if NVIDIA GPU is available -DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +from recognizer import Recognizer +from recognizer.strategies import WhisperStrategy, FastWhisperStrategy -# Load the Whisper model: -model = whisper.load_model(HARPYIA_MODEL, device=DEVICE) 
+from message import MessageService
+from message.strategies import SosMessageStrategy, NumberMessageStrategy
 
 app = Flask(__name__)
 
+whisper_recognizer = Recognizer(WhisperStrategy())
+fast_whisper_recognizer = Recognizer(FastWhisperStrategy())
+
+sos_message_service = MessageService(SosMessageStrategy())
+number_message_service = MessageService(NumberMessageStrategy())
+
+queue_stack = QueueStack(RecognizeAndSendStrategy())
+queue_stack.start_loop_in_thread()
+
 @app.route("/")
 def hello():
-    return "To recognize an audio file, upload it using a POST request with '/recognize' or '/recognize_number' route."
+    return "To recognize an audio file, upload it using a POST request with '/recognize' or '/recognize-number' route."
 
-def recognize_files(handler_fn):
-    if not request.files:
-        abort(400)
+def recognize_files(message_service: MessageService):
+    """Queue every uploaded file for recognition and return the collected results."""
+    if not request.files:
+        abort(400)
 
-    results = []
+    results = []
 
-    for filename, handle in request.files.items():
-        temp = NamedTemporaryFile()
-        handle.save(temp)
-        result = model.transcribe(temp.name, language=HARPYIA_LANGUAGE, initial_prompt=HARPYIA_PROMPT)
-        results.append({
-            'filename': filename,
-            'transcript': handler_fn(result['text']),
-        })
+    for filename, handle in request.files.items():
+        with NamedTemporaryFile() as temp:
+            handle.save(temp)
+            # Flush buffered bytes: the recognizers reopen the file via temp.name.
+            temp.flush()
 
-    print(results, file=sys.stderr)
-    return {'results': results}
+            results.append(queue_stack.append_and_await((
+                temp, whisper_recognizer, message_service,
+                config.HARPYIA_LANGUAGE, message_service.get_prompt(),
+            )))
+
+    print(results, file=sys.stderr)
+    return {'results': results}
 
 @app.route('/recognize', methods=['POST'])
 def recognize():
-    return recognize_files(lambda text: text)
+    return recognize_files(sos_message_service)
 
-@app.route('/recognize_number', methods=['POST'])
+@app.route('/recognize-number', methods=['POST'])
 def recognize_number():
-    return recognize_files(transfer_and_clean)
-
-def transfer_and_clean(input_string):
-    number_mapping = {
-        "один": "1",
-        "два": "2",
-        "три": "3"
-    }
-
-    for word, number in number_mapping.items():
-        input_string = input_string.replace(word, number)
-
-    input_string = re.sub(r'[^\d]+', '', input_string)
-
-    return input_string
-
+    return recognize_files(number_message_service)
\ No newline at end of file
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..9f9cba9
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,31 @@
+"""Runtime settings, read from the environment (a .env file is loaded first)."""
+import os
+import torch
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def _env_int(name: str, default: int) -> int:
+    """Return environment variable `name` as an int, or `default` when unset or empty."""
+    # os.getenv() yields strings; the numeric settings below must be real ints.
+    return int(os.getenv(name) or default)
+
+
+HARPYIA_MODEL = os.getenv('HARPYIA_MODEL') or 'small'
+HARPYIA_LANGUAGE = os.getenv('HARPYIA_LANGUAGE') or 'ru'
+# NOTE(review): Whisper's audio helpers work at 16 kHz (whisper.audio.SAMPLE_RATE);
+# the earlier default of 160000 was presumably a typo - confirm.
+HARPYIA_SAMPLE_RATE = _env_int('HARPYIA_SAMPLE_RATE', 16000)
+
+WHISPER_NUM_WORKERS = _env_int('WHISPER_NUM_WORKERS', 6)
+WHISPER_CPU_THREADS = _env_int('WHISPER_CPU_THREADS', 10)
+WHISPER_BEAM_SIZE = _env_int('WHISPER_BEAM_SIZE', 5)
+
+SOS_PROMPT = os.getenv('SOS_PROMPT') or 'спасите помогите помощь пожар караул кирилл'
+NUMBER_PROMPT = os.getenv('NUMBER_PROMPT') or 'один два три четыре пять шесть семь восемь девять десять одинадцать двенадцать тринадцать сто сот'
+
+RAT_URL = os.getenv('RAT_URL') or 'localhost:8081'
+
+# Check if NVIDIA GPU is available
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
\ No newline at end of file
diff --git a/src/message/__init__.py b/src/message/__init__.py
new file mode 100644
index 0000000..92b98bd
--- /dev/null
+++ b/src/message/__init__.py
@@ -0,0 +1 @@
+from message.message_service import MessageService
\ No newline at end of file
diff --git a/src/message/message_service.py b/src/message/message_service.py
new file mode 100644
index 0000000..c78e8ea
--- /dev/null
+++ b/src/message/message_service.py
@@ -0,0 +1,28 @@
+import sys
+from message.strategies import BaseMessageStrategy
+
+class MessageService:
+    """Delegates prompt lookup, message conversion and sending to a strategy."""
+
+    def __init__(self, strategy: BaseMessageStrategy) -> None:
+        self._strategy = strategy
+
+    def get_prompt(self) -> str:
+        """Return the prompt provided by the strategy."""
+        # The value has to be returned; without `return` every caller received None.
+        return self._strategy.get_prompt()
+
+    def transfer(self, text: str) -> any:
+        return self._strategy.transfer(text)
+
+    def send(self, message: str) -> any:
+        return self._strategy.send(message)
+
+    def transfer_and_send(self, recognized_result: any) -> any:
+        message = self.transfer(recognized_result)
+        # Nothing is sent when the strategy produced an empty message.
+        if message:
+            self.send(message)
+
+        print('Sending message:', recognized_result, file=sys.stderr)
+        return message
\ No newline at end of file
diff --git a/src/message/strategies/__init__.py b/src/message/strategies/__init__.py
new file mode 100644
index 0000000..8991a4f
--- /dev/null
+++ b/src/message/strategies/__init__.py
@@ -0,0 +1,3 @@
+from message.strategies.base_message_strategy import BaseMessageStrategy
+from message.strategies.sos_message_strategy import SosMessageStrategy
+from message.strategies.number_message_strategy import NumberMessageStrategy
\ No newline at end of file
diff --git a/src/message/strategies/base_message_strategy.py b/src/message/strategies/base_message_strategy.py
new file mode 100644
index 0000000..fd45ed0
--- /dev/null
+++ b/src/message/strategies/base_message_strategy.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+class BaseMessageStrategy(ABC):
+    """Interface for turning a recognition result into a message and sending it."""
+
+    @abstractmethod
+    def get_prompt(self) -> str:
+        """Return the prompt text associated with this strategy."""
+        pass
+
+    @abstractmethod
+    def transfer(self, text: str) -> any:
+        """Convert a recognition result into a message."""
+        pass
+
+    @abstractmethod
+    def send(self, message: str) -> any:
+        """Deliver a message produced by transfer()."""
+        pass
\ No newline at end of file
diff --git a/src/message/strategies/number_message_strategy.py b/src/message/strategies/number_message_strategy.py
new file mode 100644
index 0000000..054bd2d
--- /dev/null
+++ b/src/message/strategies/number_message_strategy.py
@@ -0,0 +1,33 @@
+import re
+
+import config
+from message.strategies import BaseMessageStrategy
+
+class NumberMessageStrategy(BaseMessageStrategy):
+    """Reduces recognized text to the digits it contains."""
+
+    def __init__(self, prompt=config.NUMBER_PROMPT) -> None:
+        self._prompt = prompt
+
+    def get_prompt(self):
+        return self._prompt
+
+    def transfer(self, recognized_result: any) -> str:
+        return self._transfer_and_clean(recognized_result['text'])
+
+    def
_transfer_and_clean(self, text: str) -> str:
+        number_mapping = {
+            "один": "1",
+            "два": "2",
+            "три": "3"
+        }
+        # Accumulate the replacements; restarting from `text` kept only the last one.
+        transfered_text = text
+        for word, number in number_mapping.items():
+            transfered_text = transfered_text.replace(word, number)
+        transfered_text = re.sub(r'[^\d]+', '', transfered_text)
+
+        return {'recognized': transfered_text}
+
+    def send(self, message: str) -> None:
+        pass
\ No newline at end of file
diff --git a/src/message/strategies/sos_message_strategy.py b/src/message/strategies/sos_message_strategy.py
new file mode 100644
index 0000000..39dee28
--- /dev/null
+++ b/src/message/strategies/sos_message_strategy.py
@@ -0,0 +1,31 @@
+from typing import List
+import requests
+
+import config
+from message.strategies import BaseMessageStrategy
+
+MESSAGE_ENDPOINT = '/message'
+
+class SosMessageStrategy(BaseMessageStrategy):
+    def __init__(self, prompt=config.SOS_PROMPT, url=config.RAT_URL) -> None:
+        self._prompt = prompt
+        self._url = url
+
+    def get_prompt(self):
+        return self._prompt
+
+    def transfer(self, recognized_result: any) -> str:
+        return {
+            'transcript': recognized_result['text'],
+            'results': self._filter_words_with_prompt(recognized_result['text']),
+            'segments': recognized_result['segments']
+        }
+
+    def _filter_words_with_prompt(self, text: str) -> List[str]:
+        """Return the prompt words that occur (as substrings) in the lower-cased text."""
+        lowered = text.lower()  # hoisted: the text does not change between prompt words
+        return [word for word in self._prompt.split(' ') if word in lowered]
+
+    def send(self, message) -> any:
+        pass
+        #return requests.post(self._url + MESSAGE_ENDPOINT, json={'message': message})
diff --git a/src/queue_stack/__init__.py b/src/queue_stack/__init__.py
new file mode 100644
index 0000000..b065d1b
--- /dev/null
+++ b/src/queue_stack/__init__.py
@@ -0,0 +1 @@
+from queue_stack.queue_stack import QueueStack
diff --git a/src/queue_stack/queue_stack.py b/src/queue_stack/queue_stack.py
new file mode 100644
index 0000000..6c38e20
--- /dev/null
+++ b/src/queue_stack/queue_stack.py
@@ -0,0 +1,90 @@
+import sys
+
+from threading import Thread, Event, Lock
+
+from queue_stack.strategies import BaseProcessStrategy
+
+class QueueStack:
+    """Runs queued jobs one at a time, in FIFO order, on a worker thread."""
+
+    def __init__(self, strategy: BaseProcessStrategy) -> None:
+        self._stack = []
+        self._strategy = strategy
+
+        self._lock = Lock()
+        # Set while jobs may be queued, so the worker can sleep when idle.
+        self._pending = Event()
+        self._running = False
+
+        self._last_response = None
+
+    def append(self, args, event=None, result=None) -> None:
+        """Queue a job.
+
+        args is unpacked into the strategy's process() call. Once the job is
+        done, a (response, error) pair is appended to the optional result list
+        and the optional event is set.
+        """
+        with self._lock:
+            self._stack.append((args, event, result))
+            self._pending.set()
+
+    def append_and_await(self, args) -> any:
+        """Queue a job, wait for it and return its own response.
+
+        The response travels in a per-call list rather than in the shared
+        _last_response attribute, which a later job may already have
+        overwritten by the time this caller wakes up. An exception raised
+        while processing is re-raised here.
+        """
+        event, result = Event(), []
+        self.append(args, event=event, result=result)
+
+        event.wait()
+        response, error = result[0]
+        if error is not None:
+            raise error
+
+        return response
+
+    def loop(self) -> None:
+        """Process queued jobs until stop_loop() is called."""
+        self._running = True
+
+        while self._running:
+            # Sleep instead of spinning on an empty queue.
+            self._pending.wait()
+
+            with self._lock:
+                if not self._stack:
+                    self._pending.clear()
+                    continue
+                print('Stack length:', len(self._stack), file=sys.stderr)
+                (args, event, result) = self._stack.pop(0)
+
+            # Process outside the lock so append() never waits for a running job.
+            response, error = None, None
+            try:
+                response = self._last_response = self._process(*args)
+            except Exception as exc:
+                # Keep the worker alive and hand the failure to the waiter.
+                print('Processing failed:', exc, file=sys.stderr)
+                error = exc
+
+            if result is not None:
+                result.append((response, error))
+            if event:
+                event.set()
+
+    def _process(self, *args, **kwargs) -> any:
+        return self._strategy.process(*args, **kwargs)
+
+    def start_loop_in_thread(self) -> None:
+        # Daemon thread: the worker must not keep the process alive on shutdown.
+        thread = Thread(target=self.loop, daemon=True)
+        thread.start()
+
+    def stop_loop(self) -> None:
+        self._running = False
+        # Wake the worker so it can observe the flag.
+        self._pending.set()
\ No newline at end of file
diff --git a/src/queue_stack/strategies/__init__.py b/src/queue_stack/strategies/__init__.py
new file mode 100644
index 0000000..e274e9b
--- /dev/null
+++ b/src/queue_stack/strategies/__init__.py
@@ -0,0 +1,2 @@
+from queue_stack.strategies.base_process_strategy import BaseProcessStrategy
+from queue_stack.strategies.recognize_and_send_strategy import RecognizeAndSendStrategy
\ No newline at end of file
diff --git a/src/queue_stack/strategies/base_process_strategy.py b/src/queue_stack/strategies/base_process_strategy.py
new file mode 100644
index 0000000..00d7792
--- /dev/null
+++ b/src/queue_stack/strategies/base_process_strategy.py
@@ -0,0 +1,6 @@
+from abc import ABC, abstractmethod
+
+class BaseProcessStrategy(ABC):
+    @abstractmethod
+    def process(self, *args, **kwargs) -> any:
+        pass
diff --git a/src/queue_stack/strategies/recognize_and_send_strategy.py
b/src/queue_stack/strategies/recognize_and_send_strategy.py
new file mode 100644
index 0000000..1269ff5
--- /dev/null
+++ b/src/queue_stack/strategies/recognize_and_send_strategy.py
@@ -0,0 +1,19 @@
+import sys
+
+from queue_stack.strategies import BaseProcessStrategy
+from message import MessageService
+from recognizer import Recognizer
+
+class RecognizeAndSendStrategy(BaseProcessStrategy):
+    """Recognizes a file, then hands the result to a message service."""
+
+    def process(self, file, recognizer: Recognizer, message_service: MessageService, language, prompt) -> any:
+        """Run recognition on `file` and pass the result to transfer_and_send().
+
+        The message returned by the message service is logged to stderr and returned.
+        """
+        recognized = recognizer.recognize(file, language=language, prompt=prompt)
+        outgoing = message_service.transfer_and_send(recognized)
+
+        print(outgoing, file=sys.stderr)
+        return outgoing
\ No newline at end of file
diff --git a/src/recognizer/__init__.py b/src/recognizer/__init__.py
new file mode 100644
index 0000000..402b39d
--- /dev/null
+++ b/src/recognizer/__init__.py
@@ -0,0 +1 @@
+from recognizer.recognizer import Recognizer
\ No newline at end of file
diff --git a/src/recognizer/recognizer.py b/src/recognizer/recognizer.py
new file mode 100644
index 0000000..d78135d
--- /dev/null
+++ b/src/recognizer/recognizer.py
@@ -0,0 +1,17 @@
+import sys
+
+import config
+from recognizer.strategies import BaseRecognizerStrategy
+
+class Recognizer:
+    """Delegates recognition to a strategy and logs what it returned."""
+
+    def __init__(self, strategy: BaseRecognizerStrategy) -> None:
+        self._strategy = strategy
+
+    def recognize(self, file, language, prompt) -> str:
+        """Recognize `file` with the configured strategy and return its result."""
+        result = self._strategy.recognize(file, language=language, prompt=prompt)
+        print(f'Result: {result}', file=sys.stderr)
+
+        return result
\ No newline at end of file
diff --git a/src/recognizer/strategies/__init__.py b/src/recognizer/strategies/__init__.py
new file mode 100644
index 0000000..d73c755
--- /dev/null
+++ b/src/recognizer/strategies/__init__.py
@@ -0,0 +1,3 @@
+from recognizer.strategies.base_recognizer_strategy import BaseRecognizerStrategy
+from recognizer.strategies.whisper_strategy import WhisperStrategy
+from recognizer.strategies.fast_whisper_strategy import
FastWhisperStrategy
diff --git a/src/recognizer/strategies/base_recognizer_strategy.py b/src/recognizer/strategies/base_recognizer_strategy.py
new file mode 100644
index 0000000..0bdd03a
--- /dev/null
+++ b/src/recognizer/strategies/base_recognizer_strategy.py
@@ -0,0 +1,6 @@
+from abc import ABC, abstractmethod
+
+class BaseRecognizerStrategy(ABC):
+    @abstractmethod
+    def recognize(self, file, language, prompt) -> any:
+        pass
diff --git a/src/recognizer/strategies/fast_whisper_strategy.py b/src/recognizer/strategies/fast_whisper_strategy.py
new file mode 100644
index 0000000..5d0cdac
--- /dev/null
+++ b/src/recognizer/strategies/fast_whisper_strategy.py
@@ -0,0 +1,66 @@
+import sys
+
+import whisper
+from faster_whisper import WhisperModel
+
+import config
+from recognizer.strategies import BaseRecognizerStrategy
+
+class FastWhisperStrategy(BaseRecognizerStrategy):
+    """Recognizer strategy backed by faster-whisper's WhisperModel."""
+
+    def __init__(self) -> None:
+        # Environment-supplied settings may arrive as strings, hence the int() casts.
+        self._model = WhisperModel(
+            model_size_or_path=config.HARPYIA_MODEL,
+            device=config.DEVICE,
+            num_workers=int(config.WHISPER_NUM_WORKERS),
+            cpu_threads=int(config.WHISPER_CPU_THREADS)
+        )
+
+    def recognize(self, file, language, prompt) -> any:
+        audio = self._prepare_file(file.name)
+        return self._transcribe(audio, language, prompt)
+
+    def _prepare_file(self, filename: str):
+        audio = whisper.load_audio(filename, sr=int(config.HARPYIA_SAMPLE_RATE))
+        audio = whisper.pad_or_trim(audio)
+        return audio
+
+    def _transcribe(self, audio, language, prompt):
+        """Transcribe `audio`; return a dict with the joined text and a segments placeholder."""
+        segments, _ = self._model.transcribe(
+            audio,
+            language=language,
+            initial_prompt=prompt,
+            condition_on_previous_text=False,
+            vad_filter=True,
+            beam_size=int(config.WHISPER_BEAM_SIZE),
+        )
+        # transcribe() yields segments lazily; materialise them once, otherwise the
+        # logging loop below exhausts the generator and the joined text is always empty.
+        segments = list(segments)
+
+        print('Segments:', file=sys.stderr)
+        for i in segments:
+            print(i, file=sys.stderr)
+
+        words = []
+        for segment in segments:
+            words.append(segment.text)
+
+        return {
+            'text': ' '.join(words),
+            'segments': {
+                'id': None,
+                'seek': None,
+                'start': None,
+                'end': None,
+                'text': None,
+                'tokens': None,
+                'temperature': None,
+
'avg_logprob': None,
+                'compression_ratio': None,
+                'no_speech_prob': None,
+            }
+        }
\ No newline at end of file
diff --git a/src/recognizer/strategies/whisper_strategy.py b/src/recognizer/strategies/whisper_strategy.py
new file mode 100644
index 0000000..05cdf27
--- /dev/null
+++ b/src/recognizer/strategies/whisper_strategy.py
@@ -0,0 +1,18 @@
+import whisper
+
+import config
+from recognizer.strategies import BaseRecognizerStrategy
+
+class WhisperStrategy(BaseRecognizerStrategy):
+    """Recognizer strategy backed by a model loaded with whisper.load_model()."""
+
+    def __init__(self) -> None:
+        self._model = whisper.load_model(config.HARPYIA_MODEL, device=config.DEVICE)
+
+    def recognize(self, file, language, prompt) -> any:
+        """Transcribe the file at file.name and return the model's result."""
+        return self._model.transcribe(
+            file.name,
+            language=language,
+            initial_prompt=prompt,
+        )
\ No newline at end of file