From ccfd2f62619556894b09390f8db7086bd6ab08d2 Mon Sep 17 00:00:00 2001 From: Adithya S K Date: Tue, 22 Oct 2024 14:43:17 +0530 Subject: [PATCH] updated RAG from scratch notebook - Adithya S K --- .../RAG_from_scratch.ipynb | 173 +++++++++++++++++- 1 file changed, 167 insertions(+), 6 deletions(-) diff --git a/RAG/00_RAG_from_Scratch/RAG_from_scratch.ipynb b/RAG/00_RAG_from_Scratch/RAG_from_scratch.ipynb index 4020547e..bfa13c45 100644 --- a/RAG/00_RAG_from_Scratch/RAG_from_scratch.ipynb +++ b/RAG/00_RAG_from_Scratch/RAG_from_scratch.ipynb @@ -61,7 +61,8 @@ "# !pip install -q scipy\n", "# !pip install openai\n", "# !pip install rich\n", - "!pip install pypdf2" + "# !pip install pypdf2\n", + "# !pip install gradio" ] }, { @@ -184,10 +185,167 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Adithya\\AppData\\Roaming\\Python\\Python311\\site-packages\\huggingface_hub\\utils\\_deprecation.py:131: FutureWarning: 'cached_download' (from 'huggingface_hub.file_download') is deprecated and will be removed from version '0.26'. Use `hf_hub_download` instead.\n", + " warnings.warn(warning_message, FutureWarning)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fa5fab01651b4b28aa549b38dd5e2a59", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + ".gitattributes: 0%| | 0.00/1.52k [00:00 710\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 712\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m SocketTimeout \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 713\u001b[0m \u001b[38;5;66;03m# FIXME: Ideally we'd like to include the url in the ReadTimeoutError but\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[38;5;66;03m# there is yet no clean way to get at it from this context.\u001b[39;00m\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\urllib3\\response.py:835\u001b[0m, in \u001b[0;36mHTTPResponse._raw_read\u001b[1;34m(self, amt)\u001b[0m\n\u001b[0;32m 825\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 826\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menforce_content_length\n\u001b[0;32m 827\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength_remaining \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 833\u001b[0m \u001b[38;5;66;03m# raised during streaming, so all calls with incorrect\u001b[39;00m\n\u001b[0;32m 834\u001b[0m \u001b[38;5;66;03m# Content-Length are caught.\u001b[39;00m\n\u001b[1;32m--> 835\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IncompleteRead(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp_bytes_read, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength_remaining)\n\u001b[0;32m 837\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data:\n", + "\u001b[1;31mIncompleteRead\u001b[0m: IncompleteRead(147938459 bytes read, 130273836 more expected)", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[1;31mProtocolError\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\requests\\models.py:820\u001b[0m, in \u001b[0;36mResponse.iter_content..generate\u001b[1;34m()\u001b[0m\n\u001b[0;32m 819\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 820\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw\u001b[38;5;241m.\u001b[39mstream(chunk_size, decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\urllib3\\response.py:936\u001b[0m, in \u001b[0;36mHTTPResponse.stream\u001b[1;34m(self, amt, decode_content)\u001b[0m\n\u001b[0;32m 935\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m--> 936\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 938\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data:\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\urllib3\\response.py:907\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[1;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[0;32m 903\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m<\u001b[39m amt \u001b[38;5;129;01mand\u001b[39;00m data:\n\u001b[0;32m 904\u001b[0m \u001b[38;5;66;03m# TODO make sure to initially read enough data to get past the headers\u001b[39;00m\n\u001b[0;32m 905\u001b[0m \u001b[38;5;66;03m# For example, the GZ file header takes 10 bytes, we don't want to read\u001b[39;00m\n\u001b[0;32m 906\u001b[0m \u001b[38;5;66;03m# it one byte at a time\u001b[39;00m\n\u001b[1;32m--> 907\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 908\u001b[0m decoded_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decode(data, decode_content, flush_decoder)\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\urllib3\\response.py:813\u001b[0m, in \u001b[0;36mHTTPResponse._raw_read\u001b[1;34m(self, amt)\u001b[0m\n\u001b[0;32m 811\u001b[0m fp_closed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclosed\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m--> 813\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_error_catcher\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 814\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfp_closed\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43mb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n", + "File \u001b[1;32mc:\\Python311\\Lib\\contextlib.py:155\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[1;34m(self, typ, value, traceback)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 155\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgen\u001b[38;5;241m.\u001b[39mthrow(typ, value, traceback)\n\u001b[0;32m 156\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 157\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[0;32m 158\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\urllib3\\response.py:727\u001b[0m, in \u001b[0;36mHTTPResponse._error_catcher\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (HTTPException, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 726\u001b[0m \u001b[38;5;66;03m# This includes IncompleteRead.\u001b[39;00m\n\u001b[1;32m--> 727\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ProtocolError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mConnection broken: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, e) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 729\u001b[0m \u001b[38;5;66;03m# If no exception is thrown, we should avoid cleaning up\u001b[39;00m\n\u001b[0;32m 730\u001b[0m \u001b[38;5;66;03m# unnecessarily.\u001b[39;00m\n", + "\u001b[1;31mProtocolError\u001b[0m: ('Connection broken: IncompleteRead(147938459 bytes read, 130273836 more expected)', IncompleteRead(147938459 bytes read, 130273836 more expected))", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mChunkedEncodingError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[11], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Load the sentence transformer model for embeddings\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mSentenceTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mAlibaba-NLP/gte-base-en-v1.5\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\sentence_transformers\\SentenceTransformer.py:87\u001b[0m, in \u001b[0;36mSentenceTransformer.__init__\u001b[1;34m(self, model_name_or_path, modules, device, cache_folder, use_auth_token)\u001b[0m\n\u001b[0;32m 83\u001b[0m model_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(cache_folder, model_name_or_path\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[0;32m 85\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(model_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodules.json\u001b[39m\u001b[38;5;124m'\u001b[39m)):\n\u001b[0;32m 86\u001b[0m \u001b[38;5;66;03m# Download from hub with caching\u001b[39;00m\n\u001b[1;32m---> 87\u001b[0m \u001b[43msnapshot_download\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 88\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 89\u001b[0m \u001b[43m \u001b[49m\u001b[43mlibrary_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msentence-transformers\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 90\u001b[0m \u001b[43m \u001b[49m\u001b[43mlibrary_version\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m__version__\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 91\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mflax_model.msgpack\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrust_model.ot\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtf_model.h5\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 92\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(model_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodules.json\u001b[39m\u001b[38;5;124m'\u001b[39m)): \u001b[38;5;66;03m#Load as SentenceTransformer model\u001b[39;00m\n\u001b[0;32m 95\u001b[0m modules \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_sbert_model(model_path)\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\sentence_transformers\\util.py:491\u001b[0m, in \u001b[0;36msnapshot_download\u001b[1;34m(repo_id, revision, cache_dir, library_name, library_version, user_agent, ignore_files, use_auth_token)\u001b[0m\n\u001b[0;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m version\u001b[38;5;241m.\u001b[39mparse(huggingface_hub\u001b[38;5;241m.\u001b[39m__version__) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m version\u001b[38;5;241m.\u001b[39mparse(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0.8.1\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 487\u001b[0m \u001b[38;5;66;03m# huggingface_hub v0.8.1 introduces a new cache layout. We sill use a manual layout\u001b[39;00m\n\u001b[0;32m 488\u001b[0m \u001b[38;5;66;03m# And need to pass legacy_cache_layout=True to avoid that a warning will be printed\u001b[39;00m\n\u001b[0;32m 489\u001b[0m cached_download_args[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlegacy_cache_layout\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m--> 491\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[43mcached_download\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcached_download_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(path \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.lock\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 494\u001b[0m os\u001b[38;5;241m.\u001b[39mremove(path \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.lock\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\huggingface_hub\\utils\\_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[0;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[1;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\huggingface_hub\\utils\\_deprecation.py:132\u001b[0m, in \u001b[0;36m_deprecate_method.._inner_deprecate_method..inner_f\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 130\u001b[0m warning_message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m message\n\u001b[0;32m 131\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(warning_message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[1;32m--> 132\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\huggingface_hub\\file_download.py:807\u001b[0m, in \u001b[0;36mcached_download\u001b[1;34m(url, library_name, library_version, cache_dir, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout)\u001b[0m\n\u001b[0;32m 804\u001b[0m cache_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124m?\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(cache_path)\n\u001b[0;32m 806\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m WeakFileLock(lock_path):\n\u001b[1;32m--> 807\u001b[0m \u001b[43m_download_to_tmp_and_move\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 808\u001b[0m \u001b[43m \u001b[49m\u001b[43mincomplete_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcache_path\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.incomplete\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 809\u001b[0m \u001b[43m \u001b[49m\u001b[43mdestination_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcache_path\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 810\u001b[0m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 811\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 812\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 813\u001b[0m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 814\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 815\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 816\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 818\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m force_filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 819\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcreating metadata file for \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, cache_path)\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\huggingface_hub\\file_download.py:1915\u001b[0m, in \u001b[0;36m_download_to_tmp_and_move\u001b[1;34m(incomplete_path, destination_path, url_to_download, proxies, headers, expected_size, filename, force_download)\u001b[0m\n\u001b[0;32m 1912\u001b[0m _check_disk_space(expected_size, incomplete_path\u001b[38;5;241m.\u001b[39mparent)\n\u001b[0;32m 1913\u001b[0m _check_disk_space(expected_size, destination_path\u001b[38;5;241m.\u001b[39mparent)\n\u001b[1;32m-> 1915\u001b[0m \u001b[43mhttp_get\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1917\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1918\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1919\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1920\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1921\u001b[0m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1922\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1924\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDownload complete. Moving file to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdestination_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 1925\u001b[0m _chmod_and_move(incomplete_path, destination_path)\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\huggingface_hub\\file_download.py:549\u001b[0m, in \u001b[0;36mhttp_get\u001b[1;34m(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)\u001b[0m\n\u001b[0;32m 547\u001b[0m new_resume_size \u001b[38;5;241m=\u001b[39m resume_size\n\u001b[0;32m 548\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 549\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m r\u001b[38;5;241m.\u001b[39miter_content(chunk_size\u001b[38;5;241m=\u001b[39mDOWNLOAD_CHUNK_SIZE):\n\u001b[0;32m 550\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunk: \u001b[38;5;66;03m# filter out keep-alive new chunks\u001b[39;00m\n\u001b[0;32m 551\u001b[0m progress\u001b[38;5;241m.\u001b[39mupdate(\u001b[38;5;28mlen\u001b[39m(chunk))\n", + "File \u001b[1;32mc:\\Python311\\Lib\\site-packages\\requests\\models.py:822\u001b[0m, in \u001b[0;36mResponse.iter_content..generate\u001b[1;34m()\u001b[0m\n\u001b[0;32m 820\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw\u001b[38;5;241m.\u001b[39mstream(chunk_size, decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m--> 822\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ChunkedEncodingError(e)\n\u001b[0;32m 823\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m DecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 824\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ContentDecodingError(e)\n", + "\u001b[1;31mChunkedEncodingError\u001b[0m: ('Connection broken: IncompleteRead(147938459 bytes read, 130273836 more expected)', IncompleteRead(147938459 bytes read, 130273836 more expected))" + ] + } + ], + "source": [ + "# Load the sentence transformer model for embeddings\n", + "model = SentenceTransformer(\"Alibaba-NLP/gte-base-en-v1.5\")" + ] }, { "cell_type": "markdown", @@ -201,7 +359,10 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def cosine_similarity(a, b):\n", + " return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))" + ] }, { "cell_type": "markdown",