Commit: report
olafurjohannsson committed Dec 14, 2023
1 parent 3ae91d4 commit 0054abe
Showing 1 changed file with 44 additions and 63 deletions.
107 changes: 44 additions & 63 deletions src/generate_report.ipynb
@@ -2,105 +2,86 @@
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.5) or chardet (4.0.0) doesn't match a supported version!\n",
" warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1111, 2)\n",
"(209, 2)\n",
"0 g trúi því varla að maður hafi horft á þetta t...\n",
"1 etta hefði alveg getað verið Bridgerton þáttur...\n",
"2 Mjög kjánaleg mynd, eins og við mátti búast b...\n",
"3 Mikil vitleysa. Mikið um slapstick og fratboy ...\n",
"4 Alveg öfugt við það sem að gerði upprunalegu s...\n",
" ... \n",
"95 Svo. Mikið. Ofbeldi. Svakaleg keyrsla. Keanu R...\n",
"96 Mjög skemmtileg mynd. Ekta Bond spennandi, fy...\n",
"97 Nokkuð góð skrímslastórslysamynd. Töluvert öðr...\n",
"98 Jahá... erfitt að segja margt einkennilegt vi...\n",
"99 Mjög skemmtileg mynd. Fyndin. Sniðug saga. Hug...\n",
"Name: review, Length: 100, dtype: object\n",
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
" ..\n",
"95 1\n",
"96 1\n",
"97 1\n",
"98 1\n",
"99 1\n",
"Name: sentiment, Length: 100, dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-11-29 21:38:43.624646: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2023-11-29 21:38:43.667135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-11-29 21:38:44.444958: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
"/home/olafurj/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2418: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
" warnings.warn(\n"
"ename": "HFValidationError",
"evalue": "Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../models/electra-base-google-batch8-remove-noise-model/'. Use `repo_type` argument if needed.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mHFValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_161373/1743029994.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0mX_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreview\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0my_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msentiment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0maccuracy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgcr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcall_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mMODEL\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDEVICE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0mtotal\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"acc: {0:.4f}, seed: {1}, i: {2}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccuracy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/dev/sentiment-analysis/src/generate_classification_report.py\u001b[0m in \u001b[0;36mcall_model\u001b[0;34m(X_all, y_all, folder, device, accuracy)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcall_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_all\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfolder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 159\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAutoModelForSequenceClassification\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 160\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfolder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py\u001b[0m in \u001b[0;36mfrom_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPretrainedConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[0;31m# We make a call to the config file first (which may be absent) to get the commit hash as soon as possible\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 494\u001b[0;31m resolved_config_file = cached_file(\n\u001b[0m\u001b[1;32m 495\u001b[0m \u001b[0mpretrained_model_name_or_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 496\u001b[0m \u001b[0mCONFIG_NAME\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/utils/hub.py\u001b[0m in \u001b[0;36mcached_file\u001b[0;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;31m# Load from URL or cache if already cached\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m resolved_file = hf_hub_download(\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0mpath_or_repo_id\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\u001b[0m in \u001b[0;36m_inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 108\u001b[0m ):\n\u001b[1;32m 109\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0marg_name\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"repo_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"from_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"to_id\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0mvalidate_repo_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0marg_name\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"token\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0marg_value\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\u001b[0m in \u001b[0;36mvalidate_repo_id\u001b[0;34m(repo_id)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrepo_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 158\u001b[0;31m raise HFValidationError(\n\u001b[0m\u001b[1;32m 159\u001b[0m \u001b[0;34m\"Repo id must be in the form 'repo_name' or 'namespace/repo_name':\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;34mf\" '{repo_id}'. Use `repo_type` argument if needed.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mHFValidationError\u001b[0m: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../models/electra-base-google-batch8-remove-noise-model/'. Use `repo_type` argument if needed."
]
}
],
"source": [
"import pandas as pd\n",
"import random\n",
"import generate_classification_report as gcr\n",
"import gc\n",
"import torch\n",
"\n",
"gc.collect()\n",
"torch.cuda.empty_cache()\n",
"\n",
"# Set path of CSVs here(can also be just one path, but need to edit this file)\n",
"PATH1 = \"../Datasets/IMDB-Dataset-GoogleTranslate.csv\"\n",
"PATH2 = \"../Datasets/IMDB-Dataset-MideindTranslate.csv\"\n",
"\n",
"PATH1 = \"\"\n",
"PATH2 = \"\"\n",
"# Device and model to use\n",
"DEVICE = \"cuda\"\n",
"MODEL = \"../Models/electra-base-google-batch8-remove-noise-model/\"\n",
"\n",
"# Columns to drop from CSVs (if applicable)\n",
"DROP_COLUMNS = [\"num\", \"rating\", \"id\", \"movie\", \"rating\"]\n",
"\n",
"d1 = pd.read_csv(PATH1)\n",
"d2 = pd.read_csv(PATH2)\n",
"d1.drop([\"num\", \"rating\", \"id\"], axis=1, inplace=True)\n",
"d2.drop([\"movie\", \"rating\"], axis=1, inplace=True)\n",
"\n",
"\n",
"df_orig = pd.merge(d1, d2, how=\"outer\")\n",
"d1 = d1.sample(n=100)\n",
"d2 = d2.sample(n=100)\n",
"\n",
"\n",
"device = \"cuda\"\n",
"model = \"../Models/electra-base-google-batch8-remove-noise-model/\"\n",
"\n",
"for col in DROP_COLUMNS:\n",
" if col in d1.columns:\n",
" d1.drop(col, axis=1, inplace=True)\n",
" if col in d2.columns:\n",
" d2.drop(col, axis=1, inplace=True)\n",
"\n",
"\n",
"df_orig = pd.merge(d1, d2, how=\"outer\")\n",
"total = 0\n",
"for i in range(0, 10):\n",
" r = random.randint(0, 10000)\n",
"\n",
" fifty_negative = (\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"Negative\")\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"negative\")\n",
" .dropna()\n",
" .sample(n=50, random_state=r)\n",
" )\n",
" fifty_positive = (\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"Positive\")\n",
" df_orig.where(lambda x: x[\"sentiment\"] == \"positive\")\n",
" .dropna()\n",
" .sample(n=50, random_state=r)\n",
" )\n",
"\n",
" new_df = pd.merge(\n",
" fifty_negative, fifty_positive, on=[\"sentiment\", \"review\"], how=\"outer\"\n",
" )\n",
" new_df.sentiment = new_df.sentiment.apply(lambda x: 1 if x == \"Positive\" else 0)\n",
" new_df.sentiment = new_df.sentiment.apply(lambda x: 1 if x == \"positive\" else 0)\n",
" X_all = new_df.review\n",
" y_all = new_df.sentiment\n",
" accuracy = gcr.call_model(X_all, y_all, model, device, accuracy=True)\n",
" accuracy = gcr.call_model(X_all, y_all, MODEL, DEVICE, accuracy=True)\n",
" total += accuracy\n",
" print(\"acc: {0:.4f}, seed: {1}, i: {2}\".format(accuracy, r, i))\n",
"\n",
@@ -130,7 +111,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.12"
},
"orig_nbformat": 4
},
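
A note on the HFValidationError in the new cell output: AutoModelForSequenceClassification.from_pretrained only treats its argument as a local checkpoint when that directory actually exists; otherwise the string falls through to Hugging Face Hub resolution, whose repo-id validation rejects anything containing more than one "/". The traceback also shows a lowercase "../models/..." path while the committed source sets MODEL to "../Models/...", so a case mismatch on a case-sensitive filesystem is a plausible trigger. A minimal guard, sketched under the assumption that the fine-tuned model lives in that local folder:

import os

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Assumed local folder from the notebook; the casing must match the filesystem.
MODEL = "../Models/electra-base-google-batch8-remove-noise-model/"

# from_pretrained() reads a local checkpoint only if the directory exists;
# otherwise the path is validated as a Hub repo id and raises HFValidationError.
if not os.path.isdir(MODEL):
    raise FileNotFoundError(f"Local model folder not found: {MODEL}")

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)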

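For reference, the evaluation loop in the cell draws 50 negative and 50 positive reviews per round with a shared seed, scores them, and averages accuracy over ten rounds. A standalone sketch of the same logic, using boolean indexing in place of the where(...).dropna() chain, with evaluate standing in for gcr.call_model and assumed to return an accuracy float:

import random

import pandas as pd


def balanced_accuracy(df, evaluate, rounds=10, per_class=50):
    # df needs 'review' and 'sentiment' columns with lowercase labels.
    total = 0.0
    for i in range(rounds):
        r = random.randint(0, 10000)
        # One seed per round for both classes, mirroring random_state=r above.
        neg = df[df["sentiment"] == "negative"].sample(n=per_class, random_state=r)
        pos = df[df["sentiment"] == "positive"].sample(n=per_class, random_state=r)
        sample = pd.concat([neg, pos], ignore_index=True)
        y = sample["sentiment"].apply(lambda s: 1 if s == "positive" else 0)
        acc = evaluate(sample["review"], y)  # stand-in for gcr.call_model(...)
        total += acc
        print("acc: {0:.4f}, seed: {1}, i: {2}".format(acc, r, i))
    return total / rounds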
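
The removed stderr output also carried two tokenizer warnings: pad_to_max_length is deprecated, and passing max_length without truncation=True falls back to an implicit truncation strategy. A sketch of the current call style, assuming a 512-token limit (the usual maximum for BERT/ELECTRA-style models):

from transformers import AutoTokenizer

# Assumed checkpoint; any BERT/ELECTRA-style tokenizer takes the same arguments.
tokenizer = AutoTokenizer.from_pretrained("../Models/electra-base-google-batch8-remove-noise-model/")

encoded = tokenizer(
    ["Mjög skemmtileg mynd."],
    padding="max_length",  # replaces the deprecated pad_to_max_length=True
    truncation=True,       # makes the truncation strategy explicit
    max_length=512,
    return_tensors="pt",
)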