refactor: Fix and simplify benchmark processing notebook (#125)
* feat: Fix process benchmarks notebook

* refactor: Fix and simplify benchmark processing notebook

* fix: Backward compatibility for processing script

* feat: Add params
KShivendu authored Apr 16, 2024
1 parent 5f2121e commit b7ec57e
Showing 1 changed file with 63 additions and 57 deletions.
120 changes: 63 additions & 57 deletions scripts/process-benchmarks.ipynb
@@ -93,38 +93,44 @@
" match = PATH_REGEX.match(path.name)\n",
" if match is None:\n",
" continue\n",
" \n",
"\n",
" experiment = match.groupdict()\n",
" \n",
"\n",
" with open(path, \"r\") as fp:\n",
" stats = json.load(fp)\n",
"\n",
" entry = [match[\"engine\"], match[\"m\"], match[\"ef\"], \n",
" match[\"dataset\"], match[\"search_index\"], match[\"date\"], \n",
" stats[\"params\"], stats[\"results\"]]\n",
" params = stats[\"params\"]\n",
" dataset = params.pop(\"dataset\")\n",
" engine = params.pop(\"engine\")\n",
"\n",
" entry = {\n",
" \"dataset\": dataset,\n",
" \"engine\": engine,\n",
" \"m\": match[\"m\"],\n",
" \"ef\": match[\"ef\"],\n",
" \"date\": match[\"date\"],\n",
" \"params\": params,\n",
" \"results\": stats[\"results\"],\n",
" }\n",
"\n",
" if experiment[\"operation\"] == \"search\":\n",
" entry.update({\"search_index\": match[\"search_index\"]})\n",
" search_results.append(entry)\n",
" elif experiment[\"operation\"] == \"upload\":\n",
" upload_results.append(entry)\n",
" else:\n",
" raise Exception(\"Unknown operation\")\n",
"\n",
"len(upload_results), len(search_results)"
]
},
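For readers skimming the diff, a self-contained sketch of what this parsing cell does. The PATH_REGEX pattern and the example filename layout below are assumptions reconstructed from the group names the cell references (engine, m, ef, dataset, operation, search_index, date); the real pattern is defined elsewhere in the notebook.

import json
import re
from pathlib import Path

# Assumed filename layout (illustrative), e.g.:
#   qdrant-m-16-ef-64-glove-100-angular-search-0-2024-04-16-10-30-00.json
PATH_REGEX = re.compile(
    r"(?P<engine>[a-z-]+)-m-(?P<m>\d+)-ef-(?P<ef>\d+)"
    r"-(?P<dataset>[a-zA-Z0-9-]+)-(?P<operation>search|upload)"
    r"(?:-(?P<search_index>\d+))?-(?P<date>\d{4}(?:-\d{2}){5})\.json"
)

for path in Path("results").glob("*.json"):
    match = PATH_REGEX.match(path.name)
    if match is None:
        continue  # not a benchmark output file
    stats = json.loads(path.read_text())
    print(match["engine"], match["operation"], match["date"], stats["params"])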
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2022-08-05T10:03:54.157465Z",
"start_time": "2022-08-05T10:03:54.153118Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"metadata": {},
"outputs": [],
"source": [
"column_names = [\"engine\", \"m\", \"ef\", \"dataset\", \"search_index\", \"date\", \"params\", \"results\"]"
"upload_results, search_results[0]"
]
},
{
@@ -141,18 +147,15 @@
},
"outputs": [],
"source": [
"upload_df = pd.DataFrame(upload_results, columns=column_names) \\\n",
" .drop(columns=\"search_index\")\n",
"upload_df = pd.DataFrame(upload_results)\n",
"upload_df[\"date\"] = pd.to_datetime(upload_df[\"date\"], format=\"%Y-%m-%d-%H-%M-%S\")\n",
"upload_df = upload_df.sort_values(\"date\", ascending=False) \\\n",
" .groupby([\"engine\", \"m\", \"ef\", \"dataset\"]) \\\n",
" .last()\n",
"upload_df = pd.concat([upload_df, upload_df[\"results\"].apply(pd.Series)], axis=1)\n",
"upload_df = upload_df.drop(columns=\"results\")\n",
"\n",
"print(len(upload_df))\n",
" .first()\n",
"\n",
"upload_df.sort_values(\"total_time\", ascending=True).head(n=5)"
"temp_df = upload_df.copy()\n",
"temp_df[\"total_time\"] = temp_df[\"results\"].apply(lambda x: x[\"total_time\"])\n",
"temp_df.sort_values(\"total_time\", ascending=True).head(n=5)"
]
},
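The sort-then-group idiom in this cell deduplicates runs: sorting newest-first and then taking .first() per group keeps only the latest upload for each (engine, m, ef, dataset) configuration. A minimal sketch on made-up data (all values illustrative):

import pandas as pd

# Two uploads of the same configuration; only the newer one should survive.
upload_results = [
    {"dataset": "glove-100", "engine": "qdrant", "m": "16", "ef": "128",
     "date": "2024-04-15-10-00-00", "results": {"total_time": 120.0}},
    {"dataset": "glove-100", "engine": "qdrant", "m": "16", "ef": "128",
     "date": "2024-04-16-10-00-00", "results": {"total_time": 95.0}},
]

upload_df = pd.DataFrame(upload_results)
upload_df["date"] = pd.to_datetime(upload_df["date"], format="%Y-%m-%d-%H-%M-%S")

# Newest first, then one row per configuration group.
latest = (
    upload_df.sort_values("date", ascending=False)
    .groupby(["engine", "m", "ef", "dataset"])
    .first()
)
print(latest["results"].iloc[0])  # {'total_time': 95.0} -- the newer run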
{
@@ -169,18 +172,15 @@
},
"outputs": [],
"source": [
"search_df = pd.DataFrame(search_results, columns=column_names)\n",
"search_df = pd.DataFrame(search_results)\n",
"search_df[\"date\"] = pd.to_datetime(search_df[\"date\"], format=\"%Y-%m-%d-%H-%M-%S\")\n",
"search_df = search_df.sort_values(\"date\", ascending=False) \\\n",
" .groupby([\"engine\", \"m\", \"ef\", \"dataset\", \"search_index\"]) \\\n",
" .first()\n",
"\n",
"print(len(search_df))\n",
"\n",
"for column_name in [\"params\", \"results\"]:\n",
" search_df = pd.concat([search_df, search_df[column_name].apply(pd.Series)], axis=1)\n",
" search_df = search_df.drop(columns=column_name)\n",
"search_df.sort_values(\"rps\", ascending=False).head(n=10)"
"temp_df = search_df.copy()\n",
"temp_df['rps'] = temp_df['results'].apply(lambda x: x[\"rps\"])\n",
"temp_df.sort_values(\"rps\", ascending=False).head(n=10)"
]
},
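This cell ranks rows by pulling a single nested metric out of the results dict with a lambda. When every nested metric is wanted as its own column, pd.json_normalize is an alternative to per-field lambdas; the engines and numbers below are illustrative only.

import pandas as pd

search_results = [
    {"engine": "qdrant", "results": {"rps": 1250.0, "p95_time": 0.012}},
    {"engine": "weaviate", "results": {"rps": 980.0, "p95_time": 0.021}},
]
search_df = pd.DataFrame(search_results)

# Option A, as in the cell above: extract just the metric being ranked on.
search_df["rps"] = search_df["results"].apply(lambda x: x["rps"])

# Option B: expand all nested metrics into columns at once.
flat = pd.concat(
    [search_df.drop(columns=["results", "rps"]),
     pd.json_normalize(search_df["results"].tolist())],
    axis=1,
)
print(flat.sort_values("rps", ascending=False))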
{
@@ -203,50 +203,56 @@
"metadata": {},
"outputs": [],
"source": [
"json_all = []\n",
"json_1_or_100_thread = []\n",
"json_results = []\n",
"\n",
"for index, row in joined_df.reset_index().iterrows():\n",
" engine_params = {}\n",
" if isinstance(row['search_params'], dict):\n",
" engine_params.update(row['search_params'])\n",
" if isinstance(row['params'], dict):\n",
" engine_params.update(row['params'])\n",
" \n",
" if isinstance(row['params_upload'], dict):\n",
" engine_params.update(row['params_upload'])\n",
" if isinstance(row['params_search'], dict):\n",
" search_params = row['params_search']\n",
" engine_params.update(search_params.get('config', {}))\n",
" engine_params.update(search_params.get('params', {}))\n",
" engine_params.update(search_params.get('search_params', {}))\n",
" engine_params.update(search_params.get('vectorIndexConfig', {}))\n",
"\n",
" engine_params.pop('experiment')\n",
" engine_params.pop('parallel')\n",
"\n",
" engine_name = row['engine']\n",
"\n",
" if engine_name == \"qdrant-rps\" or engine_name == \"qdrant-bq-rps\" or engine_name == \"qdrant-sq-rps\":\n",
" if engine_name.startswith(\"qdrant-\"):\n",
" engine_name = \"qdrant\"\n",
"\n",
" json_object = {\n",
" \"engine_name\": engine_name,\n",
" \"setup_name\": f\"{row['engine']}-m-{row['m']}-ef-{row['ef']}\",\n",
" \"setup_name\": f\"{row['params_search']['experiment']}\",\n",
" \"dataset_name\": row['dataset'],\n",
" # \"search_idx\": row['search_index'],\n",
" \"upload_time\": row['upload_time'],\n",
" \"total_upload_time\": row['total_time_upload'],\n",
" \"p95_time\": row['p95_time'],\n",
" \"rps\": row['rps'],\n",
" \"parallel\": row['parallel'],\n",
" \"p99_time\": row['p99_time'],\n",
" \"mean_time\": row['mean_time'],\n",
" \"mean_precisions\": row['mean_precisions'],\n",
" \"search_idx\": row['search_index'],\n",
" \"upload_time\": row['results_upload']['upload_time'],\n",
" \"total_upload_time\": row['results_upload']['total_time'],\n",
" \"p95_time\": row['results_search']['p95_time'],\n",
" \"rps\": row['results_search']['rps'],\n",
" \"parallel\": row['params_search']['parallel'],\n",
" \"p99_time\": row['results_search']['p99_time'],\n",
" \"mean_time\": row['results_search']['mean_time'],\n",
" \"mean_precisions\": row['results_search']['mean_precisions'],\n",
" \"engine_params\": engine_params,\n",
" }\n",
" json_all.append(json_object)\n",
" \n",
" parallel = row['parallel']\n",
" json_results.append(json_object)\n",
"\n",
" if parallel == 1 or parallel == 100:\n",
" json_1_or_100_thread.append(json_object)\n",
"\n",
"format = '%Y-%M-%d' # T%H:%M:%S\n",
"format = '%Y-%M-%dT%H:%M:%S'\n",
"now = datetime.now().replace(tzinfo=timezone.utc).strftime(format)\n",
"\n",
"Path(f\"results-{now}.json\").write_text(json.dumps(json_all, indent=2))\n",
"Path(f\"results-1-100-threads-{now}.json\").write_text(json.dumps(json_1_or_100_thread, indent=2))\n",
"Path(f\"results.json\").write_text(json.dumps(json_results, indent=2))\n",
"Path(f\"results-{now}.json\").write_text(json.dumps(json_results, indent=2))\n",
"\n",
"print(json_results[-1], len(json_results))\n",
"\n",
"json_1_or_100_thread[-1], len(json_all), len(json_1_or_100_thread)"
"results_df = pd.DataFrame(json_results).sort_values(\"p99_time\", ascending=True)\n",
"# results_df.to_csv('results.csv')\n",
"results_df"
]
}
],
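One note on the timestamping in the export cell: strftime's %m (month) and %M (minute) are easy to swap, and datetime.now().replace(tzinfo=timezone.utc) would relabel local wall-clock time as UTC rather than converting it. A minimal sketch of an unambiguous UTC timestamp:

from datetime import datetime, timezone

# datetime.now(timezone.utc) is already timezone-aware; %m = month, %M = minute.
now = datetime.now(timezone.utc)
print(now.strftime("%Y-%m-%dT%H:%M:%S"))  # e.g. 2024-04-16T09:30:05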