Merge pull request #77 from AnFreTh/bugfix/datasets
Bugfix/datasets
AnFreTh authored Aug 9, 2024
2 parents acfa8be + 17ac1be commit a04931d
Showing 31 changed files with 695 additions and 664 deletions.
9 changes: 8 additions & 1 deletion .gitignore
@@ -183,4 +183,11 @@ post-checkout
post-commit
post-merge
pre-push
-docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs
+docs/notebooks/data
+docs/notebooks/data/*
+docs/notebooks/embeddings
+docs/notebooks/embeddings/*
+docs/notebooks/checkpoints
+docs/notebooks/checkpoints/*
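
The added rules keep artifacts generated by running the docs notebooks (downloaded datasets, cached embeddings, Lightning checkpoints and logs) out of version control. As a quick sanity check, `git check-ignore -v` reports which rule matches a given path; the sketch below is illustrative only, and the file names in it are made up:

```python
# Confirm the new ignore rules match typical notebook artifacts.
# The paths below are hypothetical examples, not files from this repo.
import subprocess

for path in [
    "docs/notebooks/data/sample_data.parquet",
    "docs/notebooks/embeddings/doc_embeddings.npy",
    "docs/notebooks/checkpoints/last.ckpt",
    "docs/notebooks/lightning_logs/version_0/metrics.csv",
]:
    # -v prints the .gitignore source file, line number, and matching pattern
    subprocess.run(["git", "check-ignore", "-v", path], check=False)
```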
114 changes: 45 additions & 69 deletions docs/notebooks/datasets.ipynb
@@ -50,8 +50,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n"
"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
@@ -76,57 +76,27 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Stocktwits_GME_large',\n",
" 'BBC_News',\n",
" 'Stocktwits_GME',\n",
" 'Reddit_GME',\n",
" 'Reuters',\n",
" 'Spotify',\n",
" '20NewsGroups',\n",
" 'DummyDataset',\n",
" 'Spotify_most_popular',\n",
" 'Poliblogs',\n",
" 'Spotify_least_popular']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = TMDataset()\n",
"dataset.get_dataset_list()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:47.537\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:47.907\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.146\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.510\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:48.690\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
]
}
],
"source": [
"dataset = TMDataset()\n",
"dataset.fetch_dataset(name=\"Reuters\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -142,7 +112,7 @@
" array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -153,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -169,7 +139,7 @@
" array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -180,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -189,20 +159,20 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m158\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:49.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.219\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m715\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.521\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m717\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:50.871\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m744\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:51.061\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m746\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/ folder\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
]
}
],
@@ -212,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -422,7 +392,7 @@
"4 [alexia, and, ice, you, the, came, down, you, ... "
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -433,7 +403,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -443,7 +413,7 @@
" 'seinabo sey have always wondered your cause when you you can not and ive been you your just see one seinabo sey will you ever tell what really your mind cause the greatest love youll ever find seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving moving have always wondered you your like every word the ive always been and every word but the seinabo sey vargas lagola will you ever tell what really your mind cause the greatest love youll ever find you know seinabo sey vargas lagola seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving cause not seinabo sey vargas lagola seinabo sey and you cant hold heart heart heart heart and you cant hold heart heart heart heart you cant hold heart heart hold heart and you cant hold heart heart heart and you cant hold heart heart you cant you cant you cant you cant hold heart heart heart heart heart moving moving ohohooh ohohooh ohohooh']"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -454,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -463,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -472,7 +442,7 @@
"[75, 58]"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -490,7 +460,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -514,17 +484,16 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 258.65it/s]\n",
"\u001b[32m2024-08-08 22:59:55.068\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m454\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.073\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m469\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.074\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
"\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
]
}
],
@@ -541,15 +510,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2024-08-08 22:59:55.077\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
"\u001b[32m2024-08-08 22:59:55.078\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
"\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
"\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
]
}
],
@@ -561,7 +530,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -634,7 +603,7 @@
"4 BGHXO 3 [BGHXO]"
]
},
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -643,6 +612,13 @@
"dataset.dataframe.head()"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+},
{
"cell_type": "code",
"execution_count": null,
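
Taken together, the notebook cells in this diff exercise the TMDataset API end to end: list the bundled datasets, fetch one by name, inspect it as a DataFrame, then build, save, and re-fetch a custom dataset. The sketch below is a minimal reconstruction of that flow from the cells and log lines above; the import path and the keyword arguments of `create_load_save_dataset` are assumptions, not confirmed by this diff.

```python
# Minimal sketch of the workflow shown in docs/notebooks/datasets.ipynb.
# Assumptions (not confirmed by this diff): the import path and the exact
# keyword names accepted by create_load_save_dataset.
import numpy as np
import pandas as pd
from stream_topic.utils import TMDataset  # assumed import path

dataset = TMDataset()
print(dataset.get_dataset_list())       # bundled datasets, e.g. 'Reuters'

dataset.fetch_dataset(name="Reuters")   # downloads to ~/stream_topic_data/
dataset.fetch_dataset(name="Spotify")   # overwrites the previously set name
print(dataset.dataframe.head())         # documents, labels, tokens

# Build and persist a small synthetic dataset (columns are hypothetical):
rng = np.random.default_rng(0)
docs = [" ".join(rng.choice(list("ABCDEFGH"), size=10)) for _ in range(1000)]
labels = rng.integers(0, 5, size=1000).tolist()
dataset.create_load_save_dataset(
    data=pd.DataFrame({"text": docs, "labels": labels}),
    dataset_name="sample_data",
    dataset_path="data",                # -> data/sample_data.parquet
)

# A dataset saved this way is afterwards fetched from the local path:
dataset.fetch_dataset(name="sample_data")
```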