Skip to content

Commit

Permalink
validating filtering, updated whisper performance
Browse files Browse the repository at this point in the history
  • Loading branch information
Darveivoldavara committed Mar 19, 2024
1 parent 08ce725 commit b4e8e3a
Show file tree
Hide file tree
Showing 3 changed files with 338 additions and 54,529 deletions.
11 changes: 6 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,9 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.pt
.log
.wav
.mp3
.m4a
**.pass
**..pt
**..log
**..wav
**..mp3
**..m4a
228 changes: 228 additions & 0 deletions notebooks/validating_audio_duplicate_filtering_efficiency.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Проверка корректности и эффективности фильтрации дублирующих записей"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"import pandas as pd\n",
"import pymssql\n",
"import plotly.graph_objs as go\n",
"import plotly.io as pio\n",
"\n",
"pd.set_option('display.max_colwidth', None)\n",
"pio.renderers.default = \"notebook_connected\"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def ms_sql_con():\n",
" sql_name = 'voice_ai'\n",
" sql_server = '10.2.4.124'\n",
" sql_login = 'ICECORP\\\\1c_sql'\n",
"\n",
" with open('sql.pass','r') as file:\n",
" sql_pass = file.read().replace('\\n', '')\n",
" file.close()\n",
"\n",
" return pymssql.connect(\n",
" server = sql_server,\n",
" user = sql_login,\n",
" password = sql_pass,\n",
" database = sql_name,\n",
" tds_version=r'7.0'\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def read_sql(query):\n",
" return pd.read_sql_query(query, con=ms_sql_con(), parse_dates=None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"query_old = '''\n",
"SELECT *\n",
"FROM (SELECT DISTINCT\n",
" linkedid call_id\n",
" FROM calls\n",
" WHERE CAST(call_date AS DATE) BETWEEN '2024-02-08' AND '2024-02-27'\n",
" AND linkedid IS NOT NULL AND linkedid <> '') c\n",
"LEFT JOIN (SELECT DISTINCT\n",
" linkedid transcribation_id\n",
" FROM transcribations\n",
" WHERE CAST(transcribation_date AS DATE) BETWEEN '2024-02-08' AND '2024-02-27') t\n",
"ON c.call_id = t.transcribation_id;\n",
"'''\n",
"\n",
"query_new = '''\n",
"SELECT *\n",
"FROM (SELECT DISTINCT\n",
" linkedid call_id\n",
" FROM calls\n",
" WHERE CAST(call_date AS DATE) BETWEEN '2024-02-28' AND '2024-03-18'\n",
" AND linkedid IS NOT NULL AND linkedid <> '') c\n",
"LEFT JOIN (SELECT DISTINCT\n",
" linkedid transcribation_id\n",
" FROM transcribations\n",
" WHERE CAST(transcribation_date AS DATE) BETWEEN '2024-02-28' AND '2024-03-18') t\n",
"ON c.call_id = t.transcribation_id;\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"old = read_sql(query_old)\n",
"new = read_sql(query_new)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Процент транскрипций в разговорах до фильтрации на дубли - 99.52%\n"
]
}
],
"source": [
"print(f'Процент транскрипций в разговорах до фильтрации на дубли - {len(old[old.transcribation_id.notna()]) / len(old) * 100:.2f}%'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Процент транскрипций в разговорах после фильтрации на дубли - 99.49%\n"
]
}
],
"source": [
"print(f'Процент транскрипций в разговорах после фильтрации на дубли - {len(new[new.transcribation_id.notna()]) / len(new) * 100:.2f}%'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"query_old_queue = '''\n",
"SELECT AVG(time)\n",
"FROM perf_log\n",
"WHERE CAST(event_date AS DATE) BETWEEN '2024-02-08' AND '2024-02-27';\n",
"'''\n",
"\n",
"query_new_queue = '''\n",
"SELECT AVG(time)\n",
"FROM perf_log\n",
"WHERE CAST(event_date AS DATE) BETWEEN '2024-02-28' AND '2024-03-18';\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"old_queue = read_sql(query_old_queue)\n",
"new_queue = read_sql(query_new_queue)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средняя длительность очереди до фильтрации на дубли - 33.91 сек.\n"
]
}
],
"source": [
"print(f'Средняя длительность очереди до фильтрации на дубли - {old_queue.values[0][0]:.2f} сек.')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средняя длительность очереди после фильтрации на дубли - 33.24 сек.\n"
]
}
],
"source": [
"print(f'Средняя длительность очереди после фильтрации на дубли - {new_queue.values[0][0]:.2f} сек.')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "text-chats-clustering-2ZPHndiz-py3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit b4e8e3a

Please sign in to comment.