Merge pull request #77 from AnFreTh/bugfix/datasets
Bugfix/datasets
AnFreTh authored Aug 9, 2024
2 parents acfa8be + 17ac1be commit a04931d
Showing 31 changed files with 695 additions and 664 deletions.
9 changes: 8 additions & 1 deletion .gitignore
@@ -183,4 +183,11 @@ post-checkout
post-commit
post-merge
pre-push
-docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs
+docs/notebooks/data
+docs/notebooks/data/*
+docs/notebooks/embeddings
+docs/notebooks/embeddings/*
+docs/notebooks/checkpoints
+docs/notebooks/checkpoints/*
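
The added rules keep artifacts generated by running the docs notebooks (downloaded datasets, cached embeddings, Lightning checkpoints and logs) out of version control. As a quick sanity check, `git check-ignore -v` reports which rule matches a given path; the sketch below is illustrative only, and the file names in it are made up:

```python
# Confirm the new ignore rules match typical notebook artifacts.
# The paths below are hypothetical examples, not files from this repo.
import subprocess

for path in [
    "docs/notebooks/data/sample_data.parquet",
    "docs/notebooks/embeddings/doc_embeddings.npy",
    "docs/notebooks/checkpoints/last.ckpt",
    "docs/notebooks/lightning_logs/version_0/metrics.csv",
]:
    # -v prints the .gitignore source file, line number, and matching pattern
    subprocess.run(["git", "check-ignore", "-v", path], check=False)
```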
114 changes: 45 additions & 69 deletions docs/notebooks/datasets.ipynb
@@ -50,8 +50,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
@@ -76,57 +76,27 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Stocktwits_GME_large',\n",
" 'BBC_News',\n",
" 'Stocktwits_GME',\n",
" 'Reddit_GME',\n",
" 'Reuters',\n",
" 'Spotify',\n",
" '20NewsGroups',\n",
" 'DummyDataset',\n",
" 'Spotify_most_popular',\n",
" 'Poliblogs',\n",
" 'Spotify_least_popular']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = TMDataset()\n",
"dataset.get_dataset_list()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:47.537\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:47.907\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.146\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.510\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.690\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
]
}
],
"source": [
"dataset = TMDataset()\n",
"dataset.fetch_dataset(name=\"Reuters\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -142,7 +112,7 @@
" array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -153,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -169,7 +139,7 @@
" array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -180,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -189,20 +159,20 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m158\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.219\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.521\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.871\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:51.061\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
]
}
],
@@ -212,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -422,7 +392,7 @@
"4 [alexia, and, ice, you, the, came, down, you, ... "
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -433,7 +403,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -443,7 +413,7 @@
" 'seinabo sey have always wondered your cause when you you can not and ive been you your just see one seinabo sey will you ever tell what really your mind cause the greatest love youll ever find seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving moving have always wondered you your like every word the ive always been and every word but the seinabo sey vargas lagola will you ever tell what really your mind cause the greatest love youll ever find you know seinabo sey vargas lagola seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving cause not seinabo sey vargas lagola seinabo sey and you cant hold heart heart heart heart and you cant hold heart heart heart heart you cant hold heart heart hold heart and you cant hold heart heart heart and you cant hold heart heart you cant you cant you cant you cant hold heart heart heart heart heart moving moving ohohooh ohohooh ohohooh']"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -454,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -463,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -472,7 +442,7 @@
"[75, 58]"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -490,7 +460,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -514,17 +484,16 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 258.65it/s]\n",
"\u001b[32m2024-08-08 22:59:55.068\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m454\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.073\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m469\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.074\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
"\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
]
}
],
@@ -541,15 +510,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:55.077\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.078\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
]
}
],
@@ -561,7 +530,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -634,7 +603,7 @@
"4 BGHXO 3 [BGHXO]"
]
},
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -643,6 +612,13 @@
"dataset.dataframe.head()"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+},
{
"cell_type": "code",
"execution_count": null,
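
Taken together, the notebook cells in this diff exercise the TMDataset API end to end: list the bundled datasets, fetch one by name, inspect it as a DataFrame, then build, save, and re-fetch a custom dataset. The sketch below is a minimal reconstruction of that flow from the cells and log lines above; the import path and the keyword arguments of `create_load_save_dataset` are assumptions, not confirmed by this diff.

```python
# Minimal sketch of the workflow shown in docs/notebooks/datasets.ipynb.
# Assumptions (not confirmed by this diff): the import path and the exact
# keyword names accepted by create_load_save_dataset.
import numpy as np
import pandas as pd
from stream_topic.utils import TMDataset  # assumed import path

dataset = TMDataset()
print(dataset.get_dataset_list())       # bundled datasets, e.g. 'Reuters'

dataset.fetch_dataset(name="Reuters")   # downloads to ~/stream_topic_data/
dataset.fetch_dataset(name="Spotify")   # overwrites the previously set name
print(dataset.dataframe.head())         # documents, labels, tokens

# Build and persist a small synthetic dataset (columns are hypothetical):
rng = np.random.default_rng(0)
docs = [" ".join(rng.choice(list("ABCDEFGH"), size=10)) for _ in range(1000)]
labels = rng.integers(0, 5, size=1000).tolist()
dataset.create_load_save_dataset(
    data=pd.DataFrame({"text": docs, "labels": labels}),
    dataset_name="sample_data",
    dataset_path="data",                # -> data/sample_data.parquet
)

# A dataset saved this way is afterwards fetched from the local path:
dataset.fetch_dataset(name="sample_data")
```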