diff --git a/.gitignore b/.gitignore index 27a421d..d70d369 100644 --- a/.gitignore +++ b/.gitignore @@ -99,7 +99,7 @@ ipython_config.py # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -poetry.lock +**poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. @@ -107,7 +107,7 @@ poetry.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide -.toml +**.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ @@ -159,9 +159,9 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +**.pt +**.log +**.wav +**.mp3 +**.m4a **.pass -**..pt -**..log -**..wav -**..mp3 -**..m4a diff --git a/Dockerfile b/Dockerfile index cf27870..10cad6a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,9 +4,9 @@ WORKDIR /app COPY requirements.txt /app/ -RUN apt-get update && apt-get install -y git -RUN pip install \ +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* +RUN pip3 install \ git+https://github.com/linto-ai/whisper-timestamped.git#egg=whisper-timestamped[dev,vad_silero,vad_auditok,test] \ -r requirements.txt -COPY transcribe.py /app/ \ No newline at end of file +COPY transcribe.py /app/ diff --git a/client/client.py b/client/client.py deleted file mode 100644 index 2b0d4d9..0000000 --- a/client/client.py +++ /dev/null @@ -1,30 +0,0 @@ -import argparse -import os -import requests - -def upload_file(file_path: str, use_parallel: bool): - if not os.path.isfile(file_path): - print("The file does not exist.") - return - - url = "http://localhost:8000/transcribe/parallel" if use_parallel else 
"http://localhost:8000/transcribe/single" - files = {'files': (os.path.basename(file_path), open(file_path, 'rb'), 'audio/wav')} - - try: - response = requests.post(url, files=files) - response.raise_for_status() - print("Response:", response.json()) - except requests.exceptions.HTTPError as err: - print(f"HTTP Error: {err}") - except Exception as err: - print(f"Error: {err}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Upload an audio file to the FastAPI server for transcription.") - parser.add_argument('--file', type=str, required=True, help='File path of the audio file') - parser.add_argument('--parallel', action='store_true', help='Use parallel transcription endpoint') - - args = parser.parse_args() - - upload_file(args.file, args.parallel) \ No newline at end of file diff --git a/client/convert.sh b/client/convert.sh deleted file mode 100644 index 5bcbf87..0000000 --- a/client/convert.sh +++ /dev/null @@ -1 +0,0 @@ -ffmpeg -i input.m4a -acodec pcm_s16le -ar 44100 output.wav diff --git a/client/init_server.py b/client/init_server.py new file mode 100644 index 0000000..de2134b --- /dev/null +++ b/client/init_server.py @@ -0,0 +1,125 @@ +import os +import argparse +import requests +import logging + + +class Server: + """Client for the whisper transcription HTTP endpoint configured via WHISPER_SERVER_DEFAULT.""" + def __init__(self): + self.gpu_url = os.environ.get( + "WHISPER_SERVER_DEFAULT", "http://10.2.5.212:8888/transcribe" + ) + logging.basicConfig(level=logging.INFO) + + def accept_feature_extractor(self, sentences, accept): + """Append one dict (text, start, end, confidence) to sentences for every segment in accept.""" + if accept.get("text"): + for segments_rec in accept.get("segments", []): + segment_text = str(segments_rec["text"]) + segment_start = segments_rec["start"] + segment_end = segments_rec["end"] + conf_score = float(segments_rec["confidence"]) + sentences.append( + { + "text": segment_text, + "start": segment_start, + "end": segment_end, + "confidence": conf_score, + } + ) + + def transcribation_process( + self, + original_file_name, + duration=0, + side=True, + rec_date="31.01.2024", 
+ src=1, + dst=2, + linkedid=3, + file_size=0, + queue_date="31.01.2024", + transcribation_date="31.01.2024", + ): + """Upload the file, pass each returned segment to save_result and return (count, phrases, confidences); (0, [], []) on a non-200 response.""" + + sentences = [] + + file_path = original_file_name + with open(file_path, "rb") as audio_file: + response = requests.post( + self.gpu_url, + files={"file": (os.path.basename(file_path), audio_file, "audio/wav")}, + ) + + if response.status_code == 200: + accept = response.json() + self.accept_feature_extractor(sentences, accept) + else: + logging.error(f"Error in file processing: {response.text}") + return 0, [], [] + + for sentence in sentences: + self.save_result( + original_file_name, + duration, + sentence["text"], + sentence["start"], + sentence["end"], + side, + transcribation_date, + str(sentence["confidence"]), + rec_date, + src, + dst, + linkedid, + file_size, + queue_date, + ) + + phrases = [sentence["text"] for sentence in sentences] + confidences = [sentence["confidence"] for sentence in sentences] + + return len(sentences), phrases, confidences + + def save_result( + self, + original_file_name, + duration, + accept_text, + accept_start, + accept_end, + side, + transcribation_date, + conf_mid, + rec_date, + src, + dst, + linkedid, + file_size, + queue_date, + ): + """Log and print the segment text; the remaining arguments are currently unused.""" + logging.info("save result start") + print("=== save_result", accept_text) + + +def main(): + parser = argparse.ArgumentParser( + description="Send an audio file to the FastAPI server for processing." 
+ ) + parser.add_argument( + "--file", type=str, required=True, help="File path of the audio file" + ) + args = parser.parse_args() + + server = Server() + num_sentences, phrases, confidences = server.transcribation_process( + original_file_name=args.file + ) + print(f"Processed {num_sentences} sentences.") + + +if __name__ == "__main__": + main() diff --git a/compose.sh b/compose.sh index b4ec673..d5acfb8 100644 --- a/compose.sh +++ b/compose.sh @@ -3,4 +3,6 @@ # Compose, updating all files # sudo docker compose up --force-recreate --build -sudo docker compose up --build + +# Compose, remove old container versions +sudo docker compose up --build -d --remove-orphans --force-recreate diff --git a/docker-compose.yml b/docker-compose.yml index 8f74e65..d37f005 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,14 +4,13 @@ services: whisper-timestamped: container_name: whisper-timestamped ports: - - "8000:8000" + - "8888:8888" restart: unless-stopped build: context: . dockerfile: Dockerfile volumes: - ./input/:/app/input - - ./output:/app/output - ./cache:/app/cache - ./logs:/app/logs deploy: @@ -21,4 +20,4 @@ services: - driver: nvidia device_ids: ['0'] capabilities: [gpu] - command: ["gunicorn", "transcribe:app", "--workers", "1", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "240"] \ No newline at end of file + command: ["gunicorn", "transcribe:app", "--workers", "1", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8888", "--timeout", "1800"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 89bf8bb..facad0b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,6 @@ setuptools-rust==1.8.1 tiktoken==0.5.2 -pandas==2.2.0 -pyarrow==15.0.0 -fastapi==0.109.0 -python-multipart==0.0.6 +fastapi==0.110.1 +python-multipart==0.0.7 uvicorn==0.27.0 -gunicorn==21.2.0 \ No newline at end of file +gunicorn==22.0.0 \ No newline at end of file diff --git 
a/transcribe.py b/transcribe.py index ace2dcf..7a6012a 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,17 +1,12 @@ import os -import time -import wave -import json +import gc import logging from uuid import uuid4 -from typing import List -from statistics import median import torch -import pandas as pd import whisper_timestamped as whisper -from fastapi import FastAPI, UploadFile -from fastapi.responses import HTMLResponse +from fastapi import FastAPI, UploadFile, Form, HTTPException +from fastapi.responses import HTMLResponse, JSONResponse app = FastAPI() @@ -23,85 +18,13 @@ ) logger = logging.getLogger() - -def get_audio_duration(wav_filename): - # Calculate the duration of an audio file - with wave.open(wav_filename, "r") as wav_file: - frames = wav_file.getnframes() - rate = wav_file.getframerate() - duration = frames / float(rate) - return duration - - -async def transcribe_audio(files: List[UploadFile], request_type: str): - results = [] - performance_ratios = [] - - for file in files: - # Checking and saving the file - if not file.filename.endswith(".wav"): - continue - - filename = f"{uuid4()}.wav" - file_path = os.path.join(os.path.dirname(__file__), "input", filename) - - with open(file_path, "wb") as buffer: - buffer.write(await file.read()) - - # Processing the audio - try: - start_time = time.time() - audio = whisper.load_audio(file_path) - result = whisper.transcribe( - model, - audio, - vad="auditok", - language="ru", - remove_empty_words=True, - beam_size=5, - best_of=5, - temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - ) - duration_transcript = time.time() - start_time - - with open(f"/app/output/{filename}.json", "w") as json_file: - json.dump(result, json_file, indent=2, ensure_ascii=False) - except Exception as e: - logger.error(f"Error in processing file {file.filename}: {e}") - continue - - # Statistic generation - duration_audio = get_audio_duration(file_path) - performance_ratio = duration_transcript / duration_audio - 
performance_ratios.append(performance_ratio) - - results.append( - { - "filename": file.filename, - "duration_audio": duration_audio, - "duration_transcript": duration_transcript, - "performance_ratio": performance_ratio, - "request_type": request_type, - } - ) - - # Deleting a file to save space on the server - if os.path.exists(file_path): - os.remove(file_path) - - median_performance_ratio = median(performance_ratios) - logger.info(f"Median Performance Ratio: {median_performance_ratio}") - - # Saving the results in CSV - df = pd.DataFrame(results) - df.to_csv("/app/output/performance_results.csv", index=False) - - return median_performance_ratio - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.info(f"Using device: {device}") +model = None +gc.collect() +torch.cuda.empty_cache() + try: model = whisper.load_model("large-v3", device=device, download_root="./cache") logger.info("Model loaded successfully") @@ -119,8 +42,8 @@ async def main(): # HTML-form for testing in a web browser html_content = """ -
- + +
@@ -128,15 +51,49 @@ async def main(): return HTMLResponse(content=html_content) -@app.post("/transcribe/single") -async def transcribe_audio_single(files: List[UploadFile]): - # Endpoint for single processing - median_performance_ratio = await transcribe_audio(files, "single") - return {"median_performance_ratio": median_performance_ratio} +@app.post("/transcribe") +async def transcribe_audio(file: UploadFile, source_id: int = Form(0), vad: str = Form("silero")): + """Save the upload under ./input, transcribe it with whisper and return the result as JSON (HTTP 500 with the error text on failure).""" + if not file.file: + raise HTTPException(status_code=400, detail="No file provided") + + if "." not in file.filename: + raise HTTPException( + status_code=400, detail="No file extension found. Check file name" + ) + + file_ext = file.filename.rsplit(".", maxsplit=1)[1] + filename = f"{uuid4()}.{file_ext}" + file_path = os.path.join(os.path.dirname(__file__), "input", filename) + + with open(file_path, "wb") as file_object: + file_object.write(await file.read()) + + # Processing the audio + try: + audio = whisper.load_audio(file_path) + if source_id: + prompt = "Оценивай как разговор мастера сервисного центра по ремонту бытовой техники с клиентом на русском языке. Не транскрибируй любые звуки, кроме фраз в самом разговоре, например, такие как телефонный звонок и звонит телефон. Не пиши этот промпт в расшифровке." + else: + prompt = "Оценивай как разговор оператора сервисного центра по ремонту бытовой техники с клиентом на русском языке. Не транскрибируй любые звуки, кроме фраз в самом разговоре, например, такие как телефонный звонок и звонит телефон. Не пиши этот промпт в расшифровке." 
+ result = whisper.transcribe( + model, + audio, + vad=vad, + language="ru", + remove_empty_words=True, + initial_prompt=prompt, + beam_size=5, + best_of=5, + temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), + ) + except Exception as e: + logger.error(f"Error in processing file {file.filename}: {e}") + return JSONResponse(status_code=500, content={"error": str(e)}) + finally: + # Always delete the uploaded file, even when transcription fails, to save space on the server + if os.path.exists(file_path): + os.remove(file_path) -@app.post("/transcribe/parallel") -async def transcribe_audio_parallel(files: List[UploadFile]): - # Endpoint for parallel processing - median_performance_ratio = await transcribe_audio(files, "parallel") - return {"median_performance_ratio": median_performance_ratio} + return JSONResponse(content=result)