Compare commits


4 Commits

SHA1 Message Date
70e6e6ca90 Merge pull request 'feature/file-stack' (#1) from feature/file-stack into main
Reviewed-on: #1
2024-03-22 16:00:50 +00:00
37adf74745 Implemented strategies for SOS and number recognition
Deleted message sender and prompt service

Implemented fast whisper, but it is not working yet

WavStack refactored into QueueStack, which can use different strategies for processing
2024-03-22 18:59:42 +03:00
dbbf845e56 Added whisper and fast whisper implementation 2024-03-20 12:51:14 +03:00
e89122cb76 Implemented new architecture
Created a message service responsible for finding the prompts inside the recognized text and sending them to the client.

Created a recognizer with two strategies: whisper and Dany's fast whisper.

Implemented a file stack that runs in a separate thread, sends each file to the recognizer, and then sends the resulting message to the client (Rat, for example).
2024-03-19 19:01:36 +03:00
22 changed files with 414 additions and 60 deletions
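
The diffs below replace the single-file Flask app with a small strategy-based pipeline: a Recognizer wraps a transcription backend (whisper or faster-whisper), a MessageService post-processes transcripts (SOS keywords or spoken numbers), and a QueueStack serializes jobs on a background worker thread. A minimal sketch of how the pieces compose, using only names defined in the files below (illustrative wiring, not a drop-in for src/app.py):

# Illustrative wiring of the new architecture; mirrors src/app.py below.
import config
from recognizer import Recognizer
from recognizer.strategies import WhisperStrategy
from message import MessageService
from message.strategies import SosMessageStrategy
from queue_stack import QueueStack
from queue_stack.strategies import RecognizeAndSendStrategy

recognizer = Recognizer(WhisperStrategy())           # transcription backend
sos_service = MessageService(SosMessageStrategy())   # prompt + post-processing
queue = QueueStack(RecognizeAndSendStrategy())       # single worker thread
queue.start_loop_in_thread()

# Each job blocks its caller until the worker thread has processed it;
# audio_file is any object with a .name attribute pointing at a WAV file:
# result = queue.append_and_await((audio_file, recognizer, sos_service,
#                                  config.HARPYIA_LANGUAGE, sos_service.get_prompt()))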

Dockerfile

@@ -1,20 +1,23 @@
 FROM python:3.10-slim
+ENV FLASK_APP=src/app.py
+ARG PIP_REQ_FILE=requirements.txt
+
 WORKDIR /app
-COPY requirements.txt /app
-RUN apt-get update && apt-get install git -y
-RUN pip3 install -r requirements.txt
-RUN pip3 install "git+https://github.com/openai/whisper.git"
-RUN apt-get install -y ffmpeg
-RUN whisper --model medium --language ru dummy.wav; exit 0
-RUN whisper --model small --language ru dummy.wav; exit 0
-COPY . .
+
+RUN apt update && apt install git ffmpeg -y && \
+    pip3 install "git+https://github.com/openai/whisper.git"
+
+# Pre-fetch the models; the dummy transcriptions are expected to fail,
+# so ignore their exit status
+RUN whisper --model medium --language ru dummy.wav; \
+    whisper --model small --language ru dummy.wav; \
+    exit 0
+
+COPY src/ src/
+
+# Separate requirements installation to keep other dependencies
+# in cache
+COPY ${PIP_REQ_FILE} ${PIP_REQ_FILE}
+RUN pip3 install -r ${PIP_REQ_FILE}
+
 EXPOSE 5000
-ENV FLASK_APP=src/app.py
 CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]

Makefile Normal file

@@ -0,0 +1,7 @@
run:
	PYTORCH_NO_CUDA_MEMORY_CACHING=1 \
	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128 \
	CUDA_LAUNCH_BLOCKING=1 \
	FLASK_APP=src/app.py \
	flask run --host=0.0.0.0

requirements.txt

@ -1,8 +1,62 @@
flask==3.0.2 asgiref==3.7.2
Jinja2==3.1.3 av==11.0.0
blinker==1.7.0 blinker==1.7.0
Werkzeug==3.0.1 certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7 click==8.1.7
coloredlogs==15.0.1
ctranslate2==4.0.0
Cython==3.0.8
dtw-python==1.3.1
faster-whisper==1.0.0
filelock==3.13.1
Flask==3.0.2
flatbuffers==23.5.26
fsspec==2024.2.0
huggingface-hub==0.21.3
humanfriendly==10.0
idna==3.6
itsdangerous==2.1.2 itsdangerous==2.1.2
Jinja2==3.1.3
llvmlite==0.42.0
MarkupSafe==2.1.5 MarkupSafe==2.1.5
more-itertools==10.2.0
mpmath==1.3.0
networkx==3.2.1
numba==0.59.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.3.101
nvidia-nvtx-cu12==12.1.105
onnxruntime==1.17.1
openai-whisper @ git+https://github.com/openai/whisper.git@ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
packaging==23.2
pillow==10.2.0
protobuf==4.25.3
python-dotenv==1.0.1 python-dotenv==1.0.1
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
scipy==1.12.0
six==1.16.0
sympy==1.12
tiktoken==0.6.0
tokenizers==0.15.2
torch==2.2.1
torchaudio==2.2.1
torchvision==0.17.1
tqdm==4.66.2
triton==2.2.0
typing_extensions==4.10.0
urllib3==2.2.1
Werkzeug==3.0.1
whisper-timestamped==1.15.0

src/app.py

@ -1,31 +1,35 @@
from flask import Flask, abort, request from flask import Flask, abort, request
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
from dotenv import load_dotenv
import os
import whisper
import torch
import sys import sys
import re
load_dotenv() import config
HARPYIA_PROMPT = os.getenv('HARPYIA_PROMPT') or 'спасите помогите на помощь пожар' from queue_stack import QueueStack
HARPYIA_MODEL = os.getenv('HARPYIA_MODEL') or 'medium' from queue_stack.strategies import RecognizeAndSendStrategy
HARPYIA_LANGUAGE = os.getenv('HARPYIA_LANGUAGE') or 'ru'
# Check if NVIDIA GPU is available from recognizer import Recognizer
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" from recognizer.strategies import WhisperStrategy, FastWhisperStrategy
# Load the Whisper model: from message import MessageService
model = whisper.load_model(HARPYIA_MODEL, device=DEVICE) from message.strategies import SosMessageStrategy, NumberMessageStrategy
app = Flask(__name__) app = Flask(__name__)
whisper_recognizer = Recognizer(WhisperStrategy())
fast_whisper_recognizer = Recognizer(FastWhisperStrategy())
sos_message_service = MessageService(SosMessageStrategy())
number_message_service = MessageService(NumberMessageStrategy())
queue_stack = QueueStack(RecognizeAndSendStrategy())
queue_stack.start_loop_in_thread()
@app.route("/") @app.route("/")
def hello(): def hello():
return "To recognize an audio file, upload it using a POST request with '/recognize' or '/recognize_number' route." return "To recognize an audio file, upload it using a POST request with '/recognize' or '/recognize-number' route."
def recognize_files(handler_fn): def recognize_files(message_service: MessageService):
if not request.files: if not request.files:
abort(400) abort(400)
@ -34,34 +38,22 @@ def recognize_files(handler_fn):
for filename, handle in request.files.items(): for filename, handle in request.files.items():
temp = NamedTemporaryFile() temp = NamedTemporaryFile()
handle.save(temp) handle.save(temp)
result = model.transcribe(temp.name, language=HARPYIA_LANGUAGE, initial_prompt=HARPYIA_PROMPT)
results.append({ results.append(queue_stack.append_and_await((
'filename': filename, temp,
'transcript': handler_fn(result['text']), whisper_recognizer,
}) message_service,
config.HARPYIA_LANGUAGE,
message_service.get_prompt()
)))
print(results, file=sys.stderr) print(results, file=sys.stderr)
return {'results': results} return {'results': results}
@app.route('/recognize', methods=['POST']) @app.route('/recognize', methods=['POST'])
def recognize(): def recognize():
return recognize_files(lambda text: text) return recognize_files(sos_message_service)
@app.route('/recognize_number', methods=['POST']) @app.route('/recognize-number', methods=['POST'])
def recognize_number(): def recognize_number():
return recognize_files(transfer_and_clean) return recognize_files(number_message_service)
def transfer_and_clean(input_string):
number_mapping = {
"один": "1",
"два": "2",
"три": "3"
}
for word, number in number_mapping.items():
input_string = input_string.replace(word, number)
input_string = re.sub(r'[^\d]+', '', input_string)
return input_string
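
The HTTP contract is otherwise unchanged; only /recognize_number was renamed to /recognize-number. A hedged client-side sketch (the file name and host are assumptions for illustration; the port is 5000 per the Dockerfile):

# Client sketch; 'sample.wav' and localhost are assumptions for illustration.
import requests

with open('sample.wav', 'rb') as f:
    resp = requests.post('http://localhost:5000/recognize',
                         files={'sample.wav': f})
print(resp.json())  # {'results': [...]} as built by recognize_files()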

src/config.py Normal file

@@ -0,0 +1,21 @@
import os
import torch
from dotenv import load_dotenv

load_dotenv()

HARPYIA_MODEL = os.getenv('HARPYIA_MODEL') or 'small'
HARPYIA_LANGUAGE = os.getenv('HARPYIA_LANGUAGE') or 'ru'
# Whisper models expect 16 kHz audio
HARPYIA_SAMPLE_RATE = int(os.getenv('HARPYIA_SAMPLE_RATE') or 16000)

# Cast to int so values overridden from the environment stay numeric
WHISPER_NUM_WORKERS = int(os.getenv('WHISPER_NUM_WORKERS') or 6)
WHISPER_CPU_THREADS = int(os.getenv('WHISPER_CPU_THREADS') or 10)
WHISPER_BEAM_SIZE = int(os.getenv('WHISPER_BEAM_SIZE') or 5)

SOS_PROMPT = os.getenv('SOS_PROMPT') or 'спасите помогите помощь пожар караул кирилл'
NUMBER_PROMPT = os.getenv('NUMBER_PROMPT') or 'один два три четыре пять шесть семь восемь девять десять одинадцать двенадцать тринадцать сто сот'
RAT_URL = os.getenv('RAT_URL') or 'localhost:8081'

# Check if NVIDIA GPU is available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
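
All of these settings can be overridden from the environment or a .env file; for example (hypothetical values, for illustration only), HARPYIA_MODEL=medium with WHISPER_BEAM_SIZE=1 trades recognition latency for accuracy.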

src/message/__init__.py Normal file

@@ -0,0 +1 @@
from message.message_service import MessageService

src/message/message_service.py Normal file

@@ -0,0 +1,24 @@
import sys
from typing import Any

from message.strategies import BaseMessageStrategy


class MessageService:
    def __init__(self, strategy: BaseMessageStrategy) -> None:
        self._strategy = strategy

    def get_prompt(self) -> str:
        # Return is required here: app.py passes this value on as the prompt
        return self._strategy.get_prompt()

    def transfer(self, text: str) -> Any:
        return self._strategy.transfer(text)

    def send(self, message: str) -> Any:
        return self._strategy.send(message)

    def transfer_and_send(self, recognized_result: Any) -> Any:
        message = self.transfer(recognized_result)
        if message:
            self.send(message)
            print('Sending message:', message, file=sys.stderr)
        return message

src/message/strategies/__init__.py Normal file

@@ -0,0 +1,3 @@
from message.strategies.base_message_strategy import BaseMessageStrategy
from message.strategies.sos_message_strategy import SosMessageStrategy
from message.strategies.number_message_strategy import NumberMessageStrategy

src/message/strategies/base_message_strategy.py Normal file

@@ -0,0 +1,14 @@
from abc import ABC, abstractmethod
from typing import Any


class BaseMessageStrategy(ABC):
    @abstractmethod
    def get_prompt(self) -> str:
        pass

    @abstractmethod
    def transfer(self, text: str) -> Any:
        pass

    @abstractmethod
    def send(self, message: str) -> Any:
        pass

src/message/strategies/number_message_strategy.py Normal file

@@ -0,0 +1,31 @@
import re
from typing import Any

import config
from message.strategies import BaseMessageStrategy


class NumberMessageStrategy(BaseMessageStrategy):
    def __init__(self, prompt=config.NUMBER_PROMPT) -> None:
        self._prompt = prompt

    def get_prompt(self) -> str:
        return self._prompt

    def transfer(self, recognized_result: Any) -> dict:
        return self._transfer_and_clean(recognized_result['text'])

    def _transfer_and_clean(self, text: str) -> dict:
        number_mapping = {
            "один": "1",
            "два": "2",
            "три": "3"
        }
        # Accumulate replacements instead of restarting from the original
        # text on every iteration
        transferred_text = text
        for word, number in number_mapping.items():
            transferred_text = transferred_text.replace(word, number)
        transferred_text = re.sub(r'[^\d]+', '', transferred_text)
        return {'recognized': transferred_text}

    def send(self, message: str) -> None:
        pass
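
For illustration, with the mapping above a recognized result of {'text': 'один два три'} transfers to {'recognized': '123'}; anything the mapping and the final regex do not cover is stripped, so purely non-numeric speech yields an empty string.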

src/message/strategies/sos_message_strategy.py Normal file

@@ -0,0 +1,35 @@
from typing import Any, List

import requests

import config
from message.strategies import BaseMessageStrategy

MESSAGE_ENDPOINT = '/message'


class SosMessageStrategy(BaseMessageStrategy):
    def __init__(self, prompt=config.SOS_PROMPT, url=config.RAT_URL) -> None:
        self._prompt = prompt
        self._url = url

    def get_prompt(self) -> str:
        return self._prompt

    def transfer(self, recognized_result: Any) -> dict:
        return {
            'transcript': recognized_result['text'],
            'results': self._filter_words_with_prompt(recognized_result['text']),
            'segments': recognized_result['segments']
        }

    def _filter_words_with_prompt(self, text: str) -> List[str]:
        words = []
        for prompt in self._prompt.split(' '):
            if prompt in text.lower():
                words.append(prompt)
        return words

    def send(self, message) -> Any:
        pass
        # return requests.post(self._url + MESSAGE_ENDPOINT, json={'message': message})
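
For illustration, a transcript such as 'помогите, у нас пожар' would yield results of ['помогите', 'пожар'], with the full transcript and segments passed through unchanged; the actual POST to the Rat client is still commented out.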

src/queue_stack/__init__.py Normal file

@@ -0,0 +1 @@
from queue_stack.queue_stack import QueueStack

src/queue_stack/queue_stack.py Normal file

@@ -0,0 +1,51 @@
import sys
from threading import Thread, Event, Lock
from typing import Any

from queue_stack.strategies import BaseProcessStrategy


class QueueStack:
    def __init__(self, strategy: BaseProcessStrategy) -> None:
        self._stack = []
        self._strategy = strategy
        self._lock = Lock()
        self._running = False
        self._last_response = None

    def append(self, args, event=None) -> None:
        with self._lock:
            self._stack.append((args, event))

    def append_and_await(self, args) -> Any:
        # Block the caller until the worker thread has processed this job
        event = Event()
        self.append(args, event=event)
        event.wait()
        event.clear()
        return self._last_response

    def loop(self) -> None:
        self._running = True
        while self._running:
            with self._lock:
                if self._stack:
                    print('Stack length:', len(self._stack), file=sys.stderr)
                    (args, event) = self._stack.pop(0)
                    self._last_response = self._process(*args)
                    if event:
                        event.set()

    def _process(self, *args, **kwargs) -> Any:
        return self._strategy.process(*args, **kwargs)

    def start_loop_in_thread(self) -> None:
        thread = Thread(target=self.loop)
        thread.start()

    def stop_loop(self) -> None:
        self._running = False
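
append_and_await is the synchronization point between Flask request threads and the single worker: the caller parks on an Event that the loop sets after processing, and the response comes back through _last_response. A standalone sketch of the same pattern (EchoStrategy is hypothetical, made up here purely for illustration):

# Hypothetical strategy used only to demonstrate the append-and-await flow.
from queue_stack import QueueStack
from queue_stack.strategies import BaseProcessStrategy

class EchoStrategy(BaseProcessStrategy):
    def process(self, *args, **kwargs):
        return args  # stand-in for the expensive recognize-and-send work

stack = QueueStack(EchoStrategy())
stack.start_loop_in_thread()
print(stack.append_and_await(('hello',)))  # blocks until the worker pops the job
stack.stop_loop()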

src/queue_stack/strategies/__init__.py Normal file

@@ -0,0 +1,2 @@
from queue_stack.strategies.base_process_strategy import BaseProcessStrategy
from queue_stack.strategies.recognize_and_send_strategy import RecognizeAndSendStrategy

src/queue_stack/strategies/base_process_strategy.py Normal file

@@ -0,0 +1,6 @@
from abc import ABC, abstractmethod
from typing import Any


class BaseProcessStrategy(ABC):
    @abstractmethod
    def process(self, *args, **kwargs) -> Any:
        pass

src/queue_stack/strategies/recognize_and_send_strategy.py Normal file

@@ -0,0 +1,14 @@
import sys
from typing import Any

from queue_stack.strategies import BaseProcessStrategy
from message import MessageService
from recognizer import Recognizer


class RecognizeAndSendStrategy(BaseProcessStrategy):
    def process(self, file, recognizer: Recognizer, message_service: MessageService, language, prompt) -> Any:
        result = recognizer.recognize(file, language=language, prompt=prompt)
        message = message_service.transfer_and_send(result)
        print(message, file=sys.stderr)
        return message

src/recognizer/__init__.py Normal file

@@ -0,0 +1 @@
from recognizer.recognizer import Recognizer

src/recognizer/recognizer.py Normal file

@@ -0,0 +1,14 @@
import sys

from recognizer.strategies import BaseRecognizerStrategy


class Recognizer:
    def __init__(self, strategy: BaseRecognizerStrategy) -> None:
        self._strategy = strategy

    def recognize(self, file, language, prompt) -> dict:
        result = self._strategy.recognize(file, language=language, prompt=prompt)
        print(f'Result: {result}', file=sys.stderr)
        return result

src/recognizer/strategies/__init__.py Normal file

@@ -0,0 +1,3 @@
from recognizer.strategies.base_recognizer_strategy import BaseRecognizerStrategy
from recognizer.strategies.whisper_strategy import WhisperStrategy
from recognizer.strategies.fast_whisper_strategy import FastWhisperStrategy

src/recognizer/strategies/base_recognizer_strategy.py Normal file

@@ -0,0 +1,6 @@
from abc import ABC, abstractmethod
from typing import Any


class BaseRecognizerStrategy(ABC):
    @abstractmethod
    def recognize(self, file, language, prompt) -> Any:
        pass

src/recognizer/strategies/fast_whisper_strategy.py Normal file

@@ -0,0 +1,59 @@
import sys
from typing import Any

import whisper
from faster_whisper import WhisperModel

import config
from recognizer.strategies import BaseRecognizerStrategy


class FastWhisperStrategy(BaseRecognizerStrategy):
    def __init__(self) -> None:
        self._model = WhisperModel(
            model_size_or_path=config.HARPYIA_MODEL,
            device=config.DEVICE,
            num_workers=config.WHISPER_NUM_WORKERS,
            cpu_threads=config.WHISPER_CPU_THREADS
        )

    def recognize(self, file, language, prompt) -> Any:
        audio = self._prepare_file(file.name)
        return self._transcribe(audio, language, prompt)

    def _prepare_file(self, filename: str):
        audio = whisper.load_audio(filename, sr=config.HARPYIA_SAMPLE_RATE)
        audio = whisper.pad_or_trim(audio)
        return audio

    def _transcribe(self, audio, language, prompt):
        segments, _ = self._model.transcribe(
            audio,
            language=language,
            initial_prompt=prompt,
            condition_on_previous_text=False,
            vad_filter=True,
            beam_size=config.WHISPER_BEAM_SIZE,
        )
        # transcribe() returns a generator; materialize it once so it can be
        # both logged and joined below (iterating twice would yield nothing)
        segments = list(segments)

        print('Segments:', file=sys.stderr)
        for segment in segments:
            print(segment, file=sys.stderr)

        words = [segment.text for segment in segments]

        return {
            'text': ' '.join(words),
            # Placeholder matching the openai-whisper result schema;
            # faster-whisper does not populate these fields here
            'segments': {
                'id': None,
                'seek': None,
                'start': None,
                'end': None,
                'text': None,
                'tokens': None,
                'temperature': None,
                'avg_logprob': None,
                'compression_ratio': None,
                'no_speech_prob': None,
            }
        }

src/recognizer/strategies/whisper_strategy.py Normal file

@@ -0,0 +1,12 @@
from typing import Any

import whisper

import config
from recognizer.strategies import BaseRecognizerStrategy


class WhisperStrategy(BaseRecognizerStrategy):
    def __init__(self) -> None:
        self._model = whisper.load_model(config.HARPYIA_MODEL, device=config.DEVICE)

    def recognize(self, file, language, prompt) -> Any:
        return self._model.transcribe(file.name,
                                      language=language, initial_prompt=prompt)