From b654905eaba8cacf52cc9a63a07ba807781dc80d Mon Sep 17 00:00:00 2001
From: Aaron Jean
Date: Wed, 26 Jun 2024 15:48:31 +0530
Subject: [PATCH] Changes to make the pani-genie chatbot work

---
 ayushma/utils/converse.py  |   2 +-
 ayushma/utils/openaiapi.py |   6 +-
 ayushma/utils/upsert.py    | 150 +++++++++++++++++++++++++++++++------
 3 files changed, 131 insertions(+), 27 deletions(-)

diff --git a/ayushma/utils/converse.py b/ayushma/utils/converse.py
index aa32558..a92595b 100644
--- a/ayushma/utils/converse.py
+++ b/ayushma/utils/converse.py
@@ -65,7 +65,7 @@ def converse_api(
     )
 
     project = chat.project
-    top_k = request.data.get("top_k") or 100
+    top_k = request.data.get("top_k") or 3
     temperature = request.data.get("temperature") or 0.1
     stream = request.data.get("stream")
     generate_audio = request.data.get("generate_audio")
diff --git a/ayushma/utils/openaiapi.py b/ayushma/utils/openaiapi.py
index c82da95..863f8d0 100644
--- a/ayushma/utils/openaiapi.py
+++ b/ayushma/utils/openaiapi.py
@@ -88,11 +88,13 @@ def get_sanitized_reference(pinecone_references: List[QueryResponse]) -> str:
         for match in reference.matches:
             try:
                 document_id = str(match.metadata["document"])
+                author_name = str(match.metadata["author_name"])
+                url = str(match.metadata["url"])
                 text = str(match.metadata["text"]).replace("\n", " ") + ","
                 if document_id in sanitized_reference:
-                    sanitized_reference[document_id] += text
+                    sanitized_reference[document_id] += text + url
                 else:
-                    sanitized_reference[document_id] = text
+                    sanitized_reference[document_id] = text + url
             except Exception as e:
                 print(f"Error extracting reference: {e}")
                 pass
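One behavioral note on the get_sanitized_reference change: author_name is now read from the match metadata but not yet included in the reference string, and each chunk's URL is appended directly after the chunk text, so consecutive chunks run together with nothing after the URL. A minimal standalone sketch of the new accumulation logic, with hypothetical dicts standing in for Pinecone match objects:

    # Hypothetical match data; real matches come from a Pinecone query response.
    matches = [
        {"metadata": {"document": "12", "author_name": "Jane Doe",
                      "text": "Boil water for one minute.\n", "url": "https://example.org/boil"}},
        {"metadata": {"document": "12", "author_name": "Jane Doe",
                      "text": "Store it covered.", "url": "https://example.org/store"}},
    ]

    sanitized_reference = {}
    for match in matches:
        document_id = str(match["metadata"]["document"])
        url = str(match["metadata"]["url"])
        text = str(match["metadata"]["text"]).replace("\n", " ") + ","
        if document_id in sanitized_reference:
            sanitized_reference[document_id] += text + url
        else:
            sanitized_reference[document_id] = text + url

    print(sanitized_reference["12"])
    # Boil water for one minute. ,https://example.org/boilStore it covered.,https://example.org/store

If the run-together output is unintended, a space or other separator after each URL would keep the compiled reference readable.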
diff --git a/ayushma/utils/upsert.py b/ayushma/utils/upsert.py
index 13727e1..a2534b1 100644
--- a/ayushma/utils/upsert.py
+++ b/ayushma/utils/upsert.py
@@ -1,16 +1,75 @@
 import os
+import csv
+import time
 from io import BytesIO
-from typing import Optional
+from typing import List, Optional
 
 import pinecone
 import requests
 from bs4 import BeautifulSoup
 from django.conf import settings
 from PyPDF2 import PdfReader
 
 from ayushma.utils.openaiapi import get_embedding
 
 
+def read_columns_from_csv(
+    file_path,
+    column0_header,
+    column1_header,
+    column2_header,
+    column3_header,
+    column4_header,
+    column5_header,
+):
+    """
+    Reads six named columns from a CSV file into parallel lists.
+
+    :param file_path: Path to the CSV file
+    :param column0_header..column5_header: Headers of the columns to read
+    :return: One list of values per requested column, in order
+    """
+    column0_data = []
+    column1_data = []
+    column2_data = []
+    column3_data = []
+    column4_data = []
+    column5_data = []
+    with open(file_path, mode="r", encoding="utf-8-sig") as csvfile:
+        csvreader = csv.DictReader(csvfile)
+        for row in csvreader:
+            column0_data.append(row[column0_header])
+            column1_data.append(row[column1_header])
+            column2_data.append(row[column2_header])
+            column3_data.append(row[column3_header])
+            column4_data.append(row[column4_header])
+            column5_data.append(row[column5_header])
+    return (
+        column0_data,
+        column1_data,
+        column2_data,
+        column3_data,
+        column4_data,
+        column5_data,
+    )
+
+
+# Load the source CSV once at import time; these columns feed upsert_base below.
+csv_file_path = "ayushma/utils/test_upsert.csv"  # Replace with your actual CSV file name
+column0_name = "id"
+column1_name = "title"
+column2_name = "author"  # Replace with your actual column header names
+column3_name = "context"
+column4_name = "topic"
+column5_name = "url"
+(
+    id_values,
+    title_values,
+    author_values,
+    context_values,
+    topic_values,
+    url_values,
+) = read_columns_from_csv(
+    csv_file_path,
+    column0_name,
+    column1_name,
+    column2_name,
+    column3_name,
+    column4_name,
+    column5_name,
+)
+print(len(id_values))
+print(len(title_values))
+print(len(author_values))
+print(len(context_values))
+print(len(topic_values))
+print(len(url_values))
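read_columns_from_csv expects test_upsert.csv to expose the six headers wired up above (id, title, author, context, topic, url). A minimal sketch that writes a one-row sample file in that layout; the row content is hypothetical, and utf-8-sig matches the encoding the reader opens the file with:

    import csv

    # Hypothetical sample row matching the headers upsert.py reads.
    sample_rows = [
        {
            "id": "doc1_0",
            "title": "Safe water storage",
            "author": "Jane Doe",
            "context": "Store treated water in clean, covered containers.",
            "topic": "water safety",
            "url": "https://example.org/safe-water",
        },
    ]

    with open("ayushma/utils/test_upsert.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(
            f, fieldnames=["id", "title", "author", "context", "topic", "url"]
        )
        writer.writeheader()
        writer.writerows(sample_rows)

The id column becomes the Pinecone vector ID in upsert_base below, so its values should be unique across rows.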
@@ -30,10 +89,7 @@ def read_document(url):
     else:
         with open(os.path.join(settings.MEDIA_ROOT, "documents", url), "r") as f:
             document_text = f.read()
-
     return document_text
-
-
 def upsert(
     external_id: str,
     document_id: int,
@@ -43,17 +99,14 @@ def upsert(
 ):
     """
     Upserts the contents of a file, URL, or text to a Pinecone index with the specified external ID.
-
     Args:
         external_id (str): The external ID to use when upserting to the Pinecone index.
        document_id (int): The external_id of the document that is to be upserted (it is added to the vector metadata)
         s3_url (str, optional): The S3 URL of the file to upsert. Defaults to None.
         url (str, optional): The URL of the website to upsert. Defaults to None.
         text (str, optional): The text content to upsert. Defaults to None.
-
     Raises:
         Exception: If none of s3_url, url, or text is provided.
-
     Returns:
         None
     """
     pinecone.init(
         api_key=settings.PINECONE_API_KEY,
         environment=settings.PINECONE_ENVIRONMENT,
     )
     print("Initialized Pinecone and OpenAI")
-
     print("Processing...")
-
     document_lines = []
-
     if s3_url:
         document_text = read_document(s3_url)
         document_lines = document_text.strip().splitlines()
@@ -74,22 +124,18 @@ def upsert(
         html = requests.get(url).text
         soup = BeautifulSoup(html, "html.parser")
         document_lines = soup.get_text().strip().splitlines()
-
     elif text:
         document_lines = text.strip().splitlines()
     else:
         raise Exception("Either filepath, url or text must be provided")
-
     if len(document_lines) == 0:
         raise Exception(
             "[Upsert] No text found in the document. Please check the document."
         )
     print(document_lines)
-
     batch_size = (
         100  # process everything in batches of 100 (creates 100 vectors per upsert)
     )
-
     print("Fetching Pinecone index...")
     if settings.PINECONE_INDEX not in pinecone.list_indexes():
         pinecone.create_index(
@@ -97,9 +143,7 @@ def upsert(
             dimension=1536,  # 1536 is the dimension of the text-embedding-ada-002 model
         )
     pinecone_index = pinecone.Index(index_name=settings.PINECONE_INDEX)
-
     print("Upserting to Pinecone index...")
-
     for i in range(0, len(document_lines), batch_size):
         i_end = min(i + batch_size, len(document_lines))  # set end position of batch
         lines_batch = document_lines[i : i + batch_size]  # get batch of lines and IDs
@@ -107,14 +151,72 @@ def upsert(
         lines_batch = [
             line.strip() for line in lines_batch if line.strip()
         ]  # remove blank lines
         ids_batch = [f"{document_id}_{n}" for n in range(i, i_end)]  # create IDs
-
         embeds = get_embedding(lines_batch)  # create embeddings
         meta = [
             {"text": line, "document": str(document_id)} for line in lines_batch
         ]  # prep metadata and upsert batch
         to_upsert = zip(ids_batch, embeds, meta)  # zip together
         pinecone_index.upsert(
-            vectors=list(to_upsert), namespace=str(external_id)
+            vectors=list(to_upsert), namespace="all_documents"
         )  # upsert to Pinecone
-
     print("Finished upserting to Pinecone index")
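upsert now writes every vector into the shared "all_documents" namespace rather than namespace=str(external_id), so any query has to target that namespace as well. A minimal read-side sketch, using the same pinecone v2 client calls this patch uses and assuming get_embedding returns one embedding per input text (as its use in the batch loop above implies):

    import pinecone
    from django.conf import settings
    from ayushma.utils.openaiapi import get_embedding

    pinecone.init(
        api_key=settings.PINECONE_API_KEY, environment=settings.PINECONE_ENVIRONMENT
    )
    index = pinecone.Index(index_name=settings.PINECONE_INDEX)

    query_vector = get_embedding(["How should drinking water be stored?"])[0]
    results = index.query(
        vector=query_vector,
        top_k=3,  # mirrors the new default in converse_api
        namespace="all_documents",  # must match the namespace used at upsert time
        include_metadata=True,
    )
    for match in results.matches:
        print(match.metadata["document"], match.metadata["url"])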
+
+
+def upsert_base(
+    external_id: str,
+    id_1: List[str],
+    title_1: List[str],
+    author_values_1: List[str],
+    context_values_1: List[str],
+    topic_values_1: List[str],
+    url_values_1: List[str],
+):
+    """
+    Upserts CSV-sourced documents to a Pinecone index along with their metadata.
+
+    Args:
+        external_id (str): Currently unused; all vectors go to the "all_documents" namespace.
+        id_1 (List[str]): List of vector IDs.
+        title_1 (List[str]): List of document titles.
+        author_values_1 (List[str]): List of document authors.
+        context_values_1 (List[str]): List of document text contents.
+        topic_values_1 (List[str]): List of topics, one per document.
+        url_values_1 (List[str]): List of source URLs, one per document.
+
+    Returns:
+        None
+    """
+    pinecone.init(
+        api_key=settings.PINECONE_API_KEY, environment=settings.PINECONE_ENVIRONMENT
+    )
+    print("Initialized Pinecone and OpenAI")
+    print("Fetching Pinecone index...")
+    if settings.PINECONE_INDEX not in pinecone.list_indexes():
+        pinecone.create_index(
+            settings.PINECONE_INDEX,
+            dimension=1536,  # 1536 is the dimension of the text-embedding-ada-002 model
+        )
+    pinecone_index = pinecone.Index(index_name=settings.PINECONE_INDEX)
+    # Equivalent setup with the v3 client, kept for reference:
+    # pc = Pinecone(api_key=settings.PINECONE_API_KEY, environment=settings.PINECONE_ENVIRONMENT)
+    # if "default" not in pc.list_indexes().names():
+    #     pc.create_index(
+    #         name="default",
+    #         dimension=1536,
+    #         metric="cosine",
+    #         spec=PodSpec(environment=settings.PINECONE_ENVIRONMENT),
+    #     )
+    batch_size = 50
+    print("Upserting to Pinecone index...")
+    # All six columns must be the same length (one entry per document).
+    assert (
+        len(id_1)
+        == len(title_1)
+        == len(author_values_1)
+        == len(context_values_1)
+        == len(topic_values_1)
+        == len(url_values_1)
+    ), "Length mismatch among columns."
+    for i in range(0, len(context_values_1), batch_size):
+        i_end = min(i + batch_size, len(context_values_1))
+        # Slice every column down to the current batch
+        ids_batch = id_1[i:i_end]
+        titles_batch = title_1[i:i_end]
+        authors_batch = author_values_1[i:i_end]
+        contexts_batch = context_values_1[i:i_end]
+        topics_batch = topic_values_1[i:i_end]
+        urls_batch = url_values_1[i:i_end]
+        # Create embeddings for the batch of contexts
+        embeds = get_embedding(contexts_batch)
+        # Prepare per-vector metadata: title, author, text, topic, and URL
+        meta = [
+            {"document": title, "author_name": author, "text": context, "topic": topic, "url": url}
+            for title, author, context, topic, url in zip(
+                titles_batch, authors_batch, contexts_batch, topics_batch, urls_batch
+            )
+        ]
+        to_upsert = list(zip(ids_batch, embeds, meta))
+        pinecone_index.upsert(vectors=to_upsert, namespace="all_documents")
+        time.sleep(200)  # pause between batches to stay under rate limits
+    print("Finished upserting to Pinecone index")
+
+
+external_id = "28c45a92-9f4a-4d68-bd68-bb7290bcfeef"
+# upsert_base(external_id, id_values, title_values, author_values, context_values, topic_values, url_values)
+# The commented call above performs the one-shot upsert of test_upsert.csv into
+# Pinecone; uncomment it to run the upsert, then comment it out again and
+# restart the application.
\ No newline at end of file
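If editing upsert.py back and forth is undesirable, the same one-shot load can be driven from a Django shell (python manage.py shell) instead. A minimal sketch, assuming test_upsert.csv is in place so the import-time CSV read succeeds; the UUID is the external_id hard-coded in this patch:

    from ayushma.utils.upsert import (
        upsert_base,
        id_values,
        title_values,
        author_values,
        context_values,
        topic_values,
        url_values,
    )

    upsert_base(
        "28c45a92-9f4a-4d68-bd68-bb7290bcfeef",
        id_values,
        title_values,
        author_values,
        context_values,
        topic_values,
        url_values,
    )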