Commit: report
olafurjohannsson committed Dec 14, 2023
1 parent 3ae91d4 commit 0054abe
Showing 1 changed file with 44 additions and 63 deletions.
107 changes: 44 additions & 63 deletions src/generate_report.ipynb
@@ -2,105 +2,86 @@
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.5) or chardet (4.0.0) doesn't match a supported version!\n",
" warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1111, 2)\n",
"(209, 2)\n",
"0 g trúi því varla að maður hafi horft á þetta t...\n",
"1 etta hefði alveg getað verið Bridgerton þáttur...\n",
"2 Mjög kjánaleg mynd, eins og við mátti búast b...\n",
"3 Mikil vitleysa. Mikið um slapstick og fratboy ...\n",
"4 Alveg öfugt við það sem að gerði upprunalegu s...\n",
" ... \n",
"95 Svo. Mikið. Ofbeldi. Svakaleg keyrsla. Keanu R...\n",
"96 Mjög skemmtileg mynd. Ekta Bond spennandi, fy...\n",
"97 Nokkuð góð skrímslastórslysamynd. Töluvert öðr...\n",
"98 Jahá... erfitt að segja margt einkennilegt vi...\n",
"99 Mjög skemmtileg mynd. Fyndin. Sniðug saga. Hug...\n",
"Name: review, Length: 100, dtype: object\n",
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
" ..\n",
"95 1\n",
"96 1\n",
"97 1\n",
"98 1\n",
"99 1\n",
"Name: sentiment, Length: 100, dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-11-29 21:38:43.624646: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2023-11-29 21:38:43.667135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-11-29 21:38:44.444958: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
"/home/olafurj/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2418: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
" warnings.warn(\n"
"ename": "HFValidationError",
"evalue": "Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../models/electra-base-google-batch8-remove-noise-model/'. Use `repo_type` argument if needed.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mHFValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_161373/1743029994.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0mX_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreview\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0my_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msentiment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0maccuracy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgcr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcall_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mMODEL\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDEVICE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0mtotal\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"acc: {0:.4f}, seed: {1}, i: {2}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccuracy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/dev/sentiment-analysis/src/generate_classification_report.py\u001b[0m in \u001b[0;36mcall_model\u001b[0;34m(X_all, y_all, folder, device, accuracy)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcall_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfolder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 159\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAutoModelForSequenceClassification\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 160\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py\u001b[0m in \u001b[0;36mfrom_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPretrainedConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[0;31m# We make a call to the config file first (which may be absent) to get the commit hash as soon as possible\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 494\u001b[0;31m resolved_config_file = cached_file(\n\u001b[0m\u001b[1;32m 495\u001b[0m \u001b[0mpretrained_model_name_or_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 496\u001b[0m \u001b[0mCONFIG_NAME\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/utils/hub.py\u001b[0m in \u001b[0;36mcached_file\u001b[0;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;31m# Load from URL or cache if already cached\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m resolved_file = hf_hub_download(\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0mpath_or_repo_id\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\u001b[0m in \u001b[0;36m_inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 108\u001b[0m ):\n\u001b[1;32m 109\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0marg_name\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"repo_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"from_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"to_id\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0mvalidate_repo_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0marg_name\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"token\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0marg_value\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\u001b[0m in \u001b[0;36mvalidate_repo_id\u001b[0;34m(repo_id)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrepo_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 158\u001b[0;31m raise HFValidationError(\n\u001b[0m\u001b[1;32m 159\u001b[0m \u001b[0;34m\"Repo id must be in the form 'repo_name' or 'namespace/repo_name':\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;34mf\" '{repo_id}'. Use `repo_type` argument if needed.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mHFValidationError\u001b[0m: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../models/electra-base-google-batch8-remove-noise-model/'. Use `repo_type` argument if needed."
]
}
],
"source": [
"import pandas as pd\n",
"import random\n",
"import generate_classification_report as gcr\n",
"import gc\n",
"import torch\n",
"\n",
"gc.collect()\n",
"torch.cuda.empty_cache()\n",
"\n",
"# Set path of CSVs here(can also be just one path, but need to edit this file)\n",
"PATH1 = \"../Datasets/IMDB-Dataset-GoogleTranslate.csv\"\n",
"PATH2 = \"../Datasets/IMDB-Dataset-MideindTranslate.csv\"\n",
"\n",
"PATH1 = \"\"\n",
"PATH2 = \"\"\n",
"# Device and model to use\n",
"DEVICE = \"cuda\"\n",
"MODEL = \"../Models/electra-base-google-batch8-remove-noise-model/\"\n",
"\n",
"# Columns to drop from CSVs (if applicable)\n",
"DROP_COLUMNS = [\"num\", \"rating\", \"id\", \"movie\", \"rating\"]\n",
"\n",
"d1 = pd.read_csv(PATH1)\n",
"d2 = pd.read_csv(PATH2)\n",
"d1.drop([\"num\", \"rating\", \"id\"], axis=1, inplace=True)\n",
"d2.drop([\"movie\", \"rating\"], axis=1, inplace=True)\n",
"\n",
"\n",
"df_orig = pd.merge(d1, d2, how=\"outer\")\n",
"d1 = d1.sample(n=100)\n",
"d2 = d2.sample(n=100)\n",
"\n",
"\n",
"device = \"cuda\"\n",
"model = \"../Models/electra-base-google-batch8-remove-noise-model/\"\n",
"\n",
"for col in DROP_COLUMNS:\n",
" if col in d1.columns:\n",
" d1.drop(col, axis=1, inplace=True)\n",
" if col in d2.columns:\n",
" d2.drop(col, axis=1, inplace=True)\n",
"\n",
"\n",
"df_orig = pd.merge(d1, d2, how=\"outer\")\n",
"total = 0\n",
"for i in range(0, 10):\n",
" r = random.randint(0, 10000)\n",
"\n",
" fifty_negative = (\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"Negative\")\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"negative\")\n",
" .dropna()\n",
" .sample(n=50, random_state=r)\n",
" )\n",
" fifty_positive = (\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"Positive\")\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"positive\")\n",
" .dropna()\n",
" .sample(n=50, random_state=r)\n",
" )\n",
"\n",
" new_df = pd.merge(\n",
" fifty_negative, fifty_positive, on=[\"sentiment\", \"review\"], how=\"outer\"\n",
" )\n",
" new_df.sentiment = new_df.sentiment.apply(lambda x: 1 if x == \"Positive\" else 0)\n",
" new_df.sentiment = new_df.sentiment.apply(lambda x: 1 if x == \"positive\" else 0)\n",
" X_all = new_df.review\n",
" y_all = new_df.sentiment\n",
" accuracy = gcr.call_model(X_all, y_all, model, device, accuracy=True)\n",
" accuracy = gcr.call_model(X_all, y_all, MODEL, DEVICE, accuracy=True)\n",
" total += accuracy\n",
" print(\"acc: {0:.4f}, seed: {1}, i: {2}\".format(accuracy, r, i))\n",
"\n",
@@ -130,7 +111,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.12"
},
"orig_nbformat": 4
},
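
A note on the HFValidationError in the new cell output: AutoModelForSequenceClassification.from_pretrained only treats its argument as a local checkpoint when that directory actually exists; otherwise the string falls through to Hugging Face Hub resolution, whose repo-id validation rejects anything containing more than one "/". The traceback also shows a lowercase "../models/..." path while the committed source sets MODEL to "../Models/...", so a case mismatch on a case-sensitive filesystem is a plausible trigger. A minimal guard, sketched under the assumption that the fine-tuned model lives in that local folder:

import os

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Assumed local folder from the notebook; the casing must match the filesystem.
MODEL = "../Models/electra-base-google-batch8-remove-noise-model/"

# from_pretrained() reads a local checkpoint only if the directory exists;
# otherwise the path is validated as a Hub repo id and raises HFValidationError.
if not os.path.isdir(MODEL):
    raise FileNotFoundError(f"Local model folder not found: {MODEL}")

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)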

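For reference, the evaluation loop in the cell draws 50 negative and 50 positive reviews per round with a shared seed, scores them, and averages accuracy over ten rounds. A standalone sketch of the same logic, using boolean indexing in place of the where(...).dropna() chain, with evaluate standing in for gcr.call_model and assumed to return an accuracy float:

import random

import pandas as pd


def balanced_accuracy(df, evaluate, rounds=10, per_class=50):
    # df needs 'review' and 'sentiment' columns with lowercase labels.
    total = 0.0
    for i in range(rounds):
        r = random.randint(0, 10000)
        # One seed per round for both classes, mirroring random_state=r above.
        neg = df[df["sentiment"] == "negative"].sample(n=per_class, random_state=r)
        pos = df[df["sentiment"] == "positive"].sample(n=per_class, random_state=r)
        sample = pd.concat([neg, pos], ignore_index=True)
        y = sample["sentiment"].apply(lambda s: 1 if s == "positive" else 0)
        acc = evaluate(sample["review"], y)  # stand-in for gcr.call_model(...)
        total += acc
        print("acc: {0:.4f}, seed: {1}, i: {2}".format(acc, r, i))
    return total / rounds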
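
The removed stderr output also carried two tokenizer warnings: pad_to_max_length is deprecated, and passing max_length without truncation=True falls back to an implicit truncation strategy. A sketch of the current call style, assuming a 512-token limit (the usual maximum for BERT/ELECTRA-style models):

from transformers import AutoTokenizer

# Assumed checkpoint; any BERT/ELECTRA-style tokenizer takes the same arguments.
tokenizer = AutoTokenizer.from_pretrained("../Models/electra-base-google-batch8-remove-noise-model/")

encoded = tokenizer(
    ["Mjög skemmtileg mynd."],
    padding="max_length",  # replaces the deprecated pad_to_max_length=True
    truncation=True,       # makes the truncation strategy explicit
    max_length=512,
    return_tensors="pt",
)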