-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
validating filtering, updated whisper performance
- Loading branch information
1 parent
08ce725
commit b4e8e3a
Showing
3 changed files
with
338 additions
and
54,529 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
228 changes: 228 additions & 0 deletions
228
notebooks/validating_audio_duplicate_filtering_efficiency.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Проверка корректности и эффективности фильтрации дублирующих записей" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import warnings\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import pymssql\n", | ||
"import plotly.graph_objs as go\n", | ||
"import plotly.io as pio\n", | ||
"\n", | ||
"pd.set_option('display.max_colwidth', None)\n", | ||
"pio.renderers.default = \"notebook_connected\"\n", | ||
"warnings.filterwarnings(\"ignore\", category=UserWarning)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def ms_sql_con():\n", | ||
" sql_name = 'voice_ai'\n", | ||
" sql_server = '10.2.4.124'\n", | ||
" sql_login = 'ICECORP\\\\1c_sql'\n", | ||
"\n", | ||
" with open('sql.pass','r') as file:\n", | ||
" sql_pass = file.read().replace('\\n', '')\n", | ||
" file.close()\n", | ||
"\n", | ||
" return pymssql.connect(\n", | ||
" server = sql_server,\n", | ||
" user = sql_login,\n", | ||
" password = sql_pass,\n", | ||
" database = sql_name,\n", | ||
" tds_version=r'7.0'\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def read_sql(query):\n", | ||
" return pd.read_sql_query(query, con=ms_sql_con(), parse_dates=None)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"query_old = '''\n", | ||
"SELECT *\n", | ||
"FROM (SELECT DISTINCT\n", | ||
" linkedid call_id\n", | ||
" FROM calls\n", | ||
" WHERE CAST(call_date AS DATE) BETWEEN '2024-02-08' AND '2024-02-27'\n", | ||
" AND linkedid IS NOT NULL AND linkedid <> '') c\n", | ||
"LEFT JOIN (SELECT DISTINCT\n", | ||
" linkedid transcribation_id\n", | ||
" FROM transcribations\n", | ||
" WHERE CAST(transcribation_date AS DATE) BETWEEN '2024-02-08' AND '2024-02-27') t\n", | ||
"ON c.call_id = t.transcribation_id;\n", | ||
"'''\n", | ||
"\n", | ||
"query_new = '''\n", | ||
"SELECT *\n", | ||
"FROM (SELECT DISTINCT\n", | ||
" linkedid call_id\n", | ||
" FROM calls\n", | ||
" WHERE CAST(call_date AS DATE) BETWEEN '2024-02-28' AND '2024-03-18'\n", | ||
" AND linkedid IS NOT NULL AND linkedid <> '') c\n", | ||
"LEFT JOIN (SELECT DISTINCT\n", | ||
" linkedid transcribation_id\n", | ||
" FROM transcribations\n", | ||
" WHERE CAST(transcribation_date AS DATE) BETWEEN '2024-02-28' AND '2024-03-18') t\n", | ||
"ON c.call_id = t.transcribation_id;\n", | ||
"'''" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"old = read_sql(query_old)\n", | ||
"new = read_sql(query_new)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Процент транскрипций в разговорах до фильтрации на дубли - 99.52%\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(f'Процент транскрипций в разговорах до фильтрации на дубли - {len(old[old.transcribation_id.notna()]) / len(old) * 100:.2f}%'\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Процент транскрипций в разговорах после фильтрации на дубли - 99.49%\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(f'Процент транскрипций в разговорах после фильтрации на дубли - {len(new[new.transcribation_id.notna()]) / len(new) * 100:.2f}%'\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"query_old_queue = '''\n", | ||
"SELECT AVG(time)\n", | ||
"FROM perf_log\n", | ||
"WHERE CAST(event_date AS DATE) BETWEEN '2024-02-08' AND '2024-02-27';\n", | ||
"'''\n", | ||
"\n", | ||
"query_new_queue = '''\n", | ||
"SELECT AVG(time)\n", | ||
"FROM perf_log\n", | ||
"WHERE CAST(event_date AS DATE) BETWEEN '2024-02-28' AND '2024-03-18';\n", | ||
"'''" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"old_queue = read_sql(query_old_queue)\n", | ||
"new_queue = read_sql(query_new_queue)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Средняя длительность очереди до фильтрации на дубли - 33.91 сек.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(f'Средняя длительность очереди до фильтрации на дубли - {old_queue.values[0][0]:.2f} сек.')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Средняя длительность очереди после фильтрации на дубли - 33.24 сек.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(f'Средняя длительность очереди после фильтрации на дубли - {new_queue.values[0][0]:.2f} сек.')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "text-chats-clustering-2ZPHndiz-py3.11", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.