diff --git a/workflows/chatbot/indexing/doc_index.py b/workflows/chatbot/indexing/doc_index.py
new file mode 100644
index 00000000000..bd0442ee4b0
--- /dev/null
+++ b/workflows/chatbot/indexing/doc_index.py
@@ -0,0 +1,284 @@
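+"""
+Index locally uploaded documents (JSONL / PDF / DOCX) for chatbot retrieval.
+
+Dense retrieval embeds content chunks and persists them in a Chroma store;
+sparse retrieval writes content chunks to a Haystack document store
+(in-memory BM25 or Elasticsearch).
+"""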
+import argparse
+import json
+import re
+
+import PyPDF2
+from docx import Document as DDocument
+from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore
+from haystack.schema import Document as SDocument
+from langchain.docstore.document import Document
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.vectorstores import Chroma
+
+
+def split_paragraph(text, jsonl_name, max_length=378):
+    documents = []
+    for sub in text:
+        sub['doc'] = sub['doc'].replace('#', ' ')
+        sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
+        documents.append(Document(page_content=sub['doc'], metadata={"source": sub['doc_id']}))
+    return documents
+
+
+## dense indexing for a jsonl file
+def d_load_jsonl_file(file_path, process, max_length=378):
+    data = []
+    with open(file_path, 'r') as file:
+        for line in file:
+            data.append(json.loads(line))
+
+    new_sens = []
+    documents = []
+    for sub in data:
+        sub['doc'] = sub['doc'].replace('#', ' ')
+        if not process:
+            sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
+            documents.append(Document(page_content=sub['doc'], metadata={"source": sub['doc_id']}))
+        else:
+            # Split into sentences, then greedily merge neighbours until a
+            # chunk exceeds max_length characters.
+            split_sen = re.split(r'[.?!]', sub['doc'])
+            for num in range(len(split_sen)):
+                split_sen[num] = re.sub(r'\s+', ' ', split_sen[num])
+                if num + 1 < len(split_sen):
+                    if len(split_sen[num]) > max_length:
+                        new_sens.append(split_sen[num].strip())
+                    else:
+                        split_sen[num + 1] = split_sen[num] + split_sen[num + 1]
+                else:
+                    new_sens.append(split_sen[num])
+
+    if not process:
+        return documents
+
+    print("length for origin", len(new_sens))
+    paragraphs = list(set(new_sens))
+    print("length for processed", len(paragraphs))
+    documents = []
+    metadata = {"source": file_path}
+    for paragraph in paragraphs:
+        documents.append(Document(page_content=paragraph, metadata=metadata))
+    return documents
+
+
+# Unfinished xlsx loader, kept for reference; requires `import pandas as pd`.
+# def d_load_xlsx_file(file_path, process, max_length=378):
+#     data = pd.read_excel(file_path)
+#
+#     documents = []
+#     for sub in data:
+#         sub['doc'] = sub['doc'].replace('#', ' ')
+#         if not process:
+#             sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
+#             new_doc = Document(page_content=sub['doc'], metadata={"source": sub['doc_id']})
+#             documents.append(new_doc)
+
+
+## dense indexing for pdf/docx files
+def d_load_file(file_path, process, max_length=378):
+    if file_path.endswith("pdf"):
+        text = load_pdf(file_path)
+    elif file_path.endswith("docx"):
+        text = read_docx(file_path)
+    else:
+        raise ValueError("Unsupported file format: {}".format(file_path))
+
+    text = text.replace('\n', ' ')
+    text = re.sub(r'\s+', ' ', text)
+
+    # Split the document after sentence-ending punctuation, then merge the
+    # pieces pairwise to avoid overly short fragments.
+    sentences = re.split('(?<=[;!.?])', text)
+    new_sents = []
+    for i in range(int(len(sentences) / 2)):
+        new_sents.append(sentences[2 * i] + sentences[2 * i + 1])
+    if len(sentences) % 2 == 1:
+        new_sents.append(sentences[-1])
+
+    # Pack sentences into paragraphs of at most max_length characters.
+    paragraphs = []
+    current_length = 0
+    current_paragraph = ""
+    for sentence in new_sents:
+        sentence_length = len(sentence)
+        if current_length + sentence_length <= max_length:
+            current_paragraph += sentence
+            current_length += sentence_length
+        else:
+            paragraphs.append(current_paragraph.strip())
+            current_paragraph = sentence
+            current_length = sentence_length
+    paragraphs.append(current_paragraph.strip())
+    print("length for origin", len(paragraphs))
+    paragraphs = list(set(paragraphs))
+    print("length for processed", len(paragraphs))
+    documents = []
+    metadata = {"source": file_path}
+    for paragraph in paragraphs:
+        documents.append(Document(page_content=paragraph, metadata=metadata))
+    return documents
+
+
+### sparse indexing for a jsonl file
+def s_load_jsonl_file(file_path, process, document_store, max_length=378):
+    data = []
+    with open(file_path, 'r') as file:
+        for line in file:
+            data.append(json.loads(line))
+
+    new_sens = []
+    documents = []
+    for sub in data:
+        sub['doc'] = sub['doc'].replace('#', ' ')
+        if not process:
+            sub['doc'] = re.sub(r'\s+', ' ', sub['doc'])
+            documents.append(SDocument(content=sub['doc'], meta={"source": sub['doc_id']}))
+        else:
+            split_sen = re.split(r'[.?!]', sub['doc'])
+            for num in range(len(split_sen)):
+                split_sen[num] = re.sub(r'\s+', ' ', split_sen[num])
+                if num + 1 < len(split_sen):
+                    if len(split_sen[num]) > max_length:
+                        new_sens.append(split_sen[num].strip())
+                    else:
+                        split_sen[num + 1] = split_sen[num] + split_sen[num + 1]
+                else:
+                    new_sens.append(split_sen[num])
+
+    if process:
+        print("length for origin", len(new_sens))
+        paragraphs = list(set(new_sens))
+        print("length for processed", len(paragraphs))
+        documents = []
+        metadata = {"source": file_path}
+        for paragraph in paragraphs:
+            documents.append(SDocument(content=paragraph, meta=metadata))
+    document_store.write_documents(documents)
+    return document_store
+
+
+### sparse indexing for pdf/docx files
+def s_load_file(file_path, process, document_store, max_length=378):
+    if file_path.endswith("pdf"):
+        text = load_pdf(file_path)
+    elif file_path.endswith("docx"):
+        text = read_docx(file_path)
+    else:
+        raise ValueError("Unsupported file format: {}".format(file_path))
+
+    text = text.replace('\n', ' ')
+    text = re.sub(r'\s+', ' ', text)
+
+    # Split after sentence-ending punctuation, then merge pairwise.
+    sentences = re.split('(?<=[;!.?])', text)
+    new_sents = []
+    for i in range(int(len(sentences) / 2)):
+        new_sents.append((sentences[2 * i] + sentences[2 * i + 1]).strip())
+    if len(sentences) % 2 == 1:
+        new_sents.append(sentences[-1])
+
+    # Pack sentences into paragraphs of at most max_length characters.
+    paragraphs = []
+    current_length = 0
+    current_paragraph = ""
+    for sentence in new_sents:
+        sentence_length = len(sentence)
+        if current_length + sentence_length <= max_length:
+            current_paragraph += sentence
+            current_length += sentence_length
+        else:
+            paragraphs.append(current_paragraph.strip())
+            current_paragraph = sentence
+            current_length = sentence_length
+    paragraphs.append(current_paragraph.strip())
+    print("length for origin", len(paragraphs))
+    paragraphs = list(set(paragraphs))
+    print("length for processed", len(paragraphs))
+    documents = []
+    metadata = {"source": file_path}
+    for paragraph in paragraphs:
+        documents.append(SDocument(content=paragraph, meta=metadata))
+    document_store.write_documents(documents)
+
+    return document_store
+
+
+def persist_embedding(documents, persist_directory, model_path):
+    # Embed the documents and persist the Chroma store to local disk.
+    embedding = HuggingFaceInstructEmbeddings(model_name=model_path)
+    vectordb = Chroma.from_documents(documents=documents, embedding=embedding,
+                                     persist_directory=persist_directory)
+    vectordb.persist()
+    vectordb = None  # drop the reference once the store is flushed
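+
+
+# Illustrative sketch (not used by the CLI below): reload a store produced by
+# persist_embedding. Assumes the same embedding model that was used at indexing time.
+def load_persisted_embedding(persist_directory, model_path):
+    embedding = HuggingFaceInstructEmbeddings(model_name=model_path)
+    return Chroma(persist_directory=persist_directory, embedding_function=embedding)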
+
+
+def read_docx(doc_path):
+    doc = DDocument(doc_path)
+    # Join paragraphs with spaces so words from adjacent paragraphs do not fuse.
+    return ' '.join(paragraph.text for paragraph in doc.paragraphs)
+
+
+def load_pdf(pdf_path):
+    text = ''
+    with open(pdf_path, 'rb') as pdf_file:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--file_path', type=str, help='The user upload file.',
+                        default="/data1/lkk/llm_inference/chat-langchain/inc_documents_formated.jsonl")
+    parser.add_argument('--process', action='store_true',
+                        help='Whether to split the loaded content into shorter chunks.')
+    parser.add_argument('--embedding_model', type=str, help='Select which model to embed the content.',
+                        default='/data1/lkk/instructor_large/')
+    parser.add_argument('--output_path', type=str, help='Where to save the embedding.', default='db_jsonl122')
+    parser.add_argument('--embedding_method', type=str,
+                        help='Select dense retrieval or sparse retrieval.', default='dense')
+    parser.add_argument('--store', type=str,
+                        help='The document store for sparse retrieval: "inmemory" or "Elasticsearch".',
+                        default='inmemory')
+
+    args = parser.parse_args()
+
+    if args.embedding_method == "dense":  # currently use Chroma as the dense retrieval datastore
+        if args.file_path.endswith("jsonl"):
+            documents = d_load_jsonl_file(args.file_path, args.process)
+        elif args.file_path.endswith("pdf") or args.file_path.endswith("docx"):
+            documents = d_load_file(args.file_path, args.process)
+        else:
+            raise SystemExit("{} is ignored. Will support this file format soon.".format(args.file_path))
+        persist_embedding(documents, args.output_path, args.embedding_model)
+    elif args.embedding_method == "sparse":  # sparse retrieval supports in-memory and Elasticsearch stores
+        if args.store == "inmemory":
+            document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
+        elif args.store == "Elasticsearch":
+            document_store = ElasticsearchDocumentStore(host="localhost", index="elastic_index_1",
+                                                        port=9200, search_fields=["content", "title"])
+        else:
+            raise SystemExit("Unknown document store: {}".format(args.store))
+        if args.file_path.endswith("jsonl"):
+            document_store = s_load_jsonl_file(args.file_path, args.process, document_store)
+        elif args.file_path.endswith("pdf") or args.file_path.endswith("docx"):
+            document_store = s_load_file(args.file_path, args.process, document_store)
+        else:
+            raise SystemExit("{} is ignored. Will support this file format soon.".format(args.file_path))
diff --git a/workflows/chatbot/indexing/readme.md b/workflows/chatbot/indexing/readme.md
new file mode 100644
index 00000000000..4fb3f35d761
--- /dev/null
+++ b/workflows/chatbot/indexing/readme.md
@@ -0,0 +1,51 @@
+Document Indexing
+======
+1. [Introduction](#introduction)
+2. [Get Started](#get-started)
+
+## Introduction
+
+Document indexing parses locally uploaded files and stores them in a document store for later content retrieval. We provide two separate indexing methods: sparse retrieval and dense retrieval.
+
+Sparse Retrieval (SR) projects the content into a sparse vector that closely aligns with the vocabulary of the content's language, typically via traditional bag-of-words techniques such as TF-IDF or BM25.
+
+On the other hand, Dense Retrieval (DR) encodes the content as one or more dense vectors. Users can specify a local pretrained model or use a GPT model from OpenAI to obtain the embeddings of the uploaded content. The choice between sparse and dense retrieval depends on the requirements of the individual use case.
+
+Our repository currently supports three document stores: `In Memory` and `Elasticsearch` for sparse retrieval, and `Chroma` for dense retrieval. Each document store has its own characteristics; select one based on the maturity of your project, the intended use case, and the technical environment.
+
+|Document store |Main features |Platform |
+|:----------|:----------|:------------------|
+|Elasticsearch |Sparse retrieval with many tuning options and basic support for dense retrieval. |Haystack|
+|In Memory |Simple document store, with no extra services or dependencies. Not recommended for production. |Haystack|
+|Chroma |Focuses on dense retrieval. Easy to use, lightweight, and fast for retrieval. |LangChain|
+
+Support for other document stores will be available soon.
+
+Right now, users can upload files in the PDF, DOCX, and JSONL formats. After indexing, the user can easily edit the local document store to add or delete a specific file.
+
+## Get Started
+
+### Sparse Indexing
+
+For sparse indexing, parsing a local file into the desired document store is straightforward: provide the file path with the `--file_path` parameter and choose a local document store with the `--store` parameter.
+
+Note, however, that the `In Memory` store does not persist to a local database: documents are kept only in memory, so they must be re-processed on every run.
+
+```bash
+python doc_index.py --file_path "xxx" --output_path "xxx" --embedding_method sparse --store Elasticsearch
+```
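+
+Once documents are written to the store, they can be queried with a BM25 retriever. The following is a minimal sketch, assuming farm-haystack 1.x (`BM25Retriever` from `haystack.nodes`); the JSONL path and query string are placeholders:
+
+```python
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.nodes import BM25Retriever
+
+from doc_index import s_load_jsonl_file
+
+# Build an in-memory BM25 store from a local JSONL file
+# (each line must contain "doc" and "doc_id" fields).
+document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
+document_store = s_load_jsonl_file("docs.jsonl", False, document_store)
+
+# Retrieve the best-matching chunks for a query.
+retriever = BM25Retriever(document_store=document_store)
+for doc in retriever.retrieve(query="example query", top_k=3):
+    print(doc.content)
+```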
+
+### Dense Indexing
+
+For dense indexing, users can choose their preferred pretrained encoder model. In this example we use the `instructor-large` model through the `HuggingFaceInstructEmbeddings` API. A suitable model can be selected from the [text embedding benchmark leaderboard](https://huggingface.co/spaces/mteb/leaderboard). Both local models and models hosted on the HuggingFace Hub are supported; to use a specific model, pass its name or path with `--embedding_model`.
+
+Alternatively, users can utilize GPT models from OpenAI. To incorporate a GPT model, minor adjustments need to be made to the following code (a valid OpenAI API key must be available in the environment):
+```python
+from langchain.embeddings import OpenAIEmbeddings
+embeddings = OpenAIEmbeddings()
+```
+
+The user can start dense indexing with:
+```bash
+python doc_index.py --file_path "xxx" --output_path "xxx" --embedding_model hkunlp/instructor-large --embedding_method dense
+```
\ No newline at end of file
diff --git a/workflows/chatbot/indexing/requirements.txt b/workflows/chatbot/indexing/requirements.txt
new file mode 100644
index 00000000000..13209d27aaa
--- /dev/null
+++ b/workflows/chatbot/indexing/requirements.txt
@@ -0,0 +1,6 @@
+langchain
+chromadb
+PyPDF2
+farm-haystack
+python-docx
+InstructorEmbedding
\ No newline at end of file