Commit 3a2d529

update llama_index

qingzhong1 committed Jan 24, 2024
1 parent 686b496 commit 3a2d529
Showing 2 changed files with 25 additions and 4 deletions.
File 1 of 2:

@@ -7,3 +7,4 @@ WeasyPrint==52.5
 openai>1.0
 langchain_openai
 zhon
+llama_index
File 2 of 2:

@@ -1,9 +1,10 @@
 import json
 import os
-from typing import Any, List
+from typing import Any, Callable, List
 
 import faiss
 import jsonlines
+import spacy
 from langchain.docstore.document import Document
 from langchain.text_splitter import SpacyTextSplitter
 from langchain.vectorstores import FAISS
@@ -31,6 +32,19 @@
 """
 
 
+def split_by_sentence_tokenizer(
+    pipeline="zh_core_web_sm", max_length: int = 1_000_000
+) -> Callable[[str], List[str]]:
+    sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
+    sentencizer.max_length = max_length
+
+    def split(text: str) -> List[str]:
+        sentences = (s.text for s in sentencizer(text).sents)
+        return [item for item in sentences]
+
+    return split
+
+
 class GenerateAbstract:
     def __init__(self, llm, chunk_size: int = 1500, chunk_overlap=0, path="./abstract.json"):
         self.chunk_size = chunk_size
Expand Down Expand Up @@ -201,7 +215,9 @@ def build_index_llama(index_name, embeddings, path=None, url_path=None, abstract
return index
if not abstract and not origin_data:
documents = preprocess(path, url_path=url_path, use_langchain=False)
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
text_splitter = SentenceSplitter(
chunking_tokenizer_fn=split_by_sentence_tokenizer(), chunk_size=1024, chunk_overlap=0
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embeddings, text_splitter=text_splitter)
index = VectorStoreIndex.from_documents(
@@ -214,7 +230,9 @@ def build_index_llama(index_name, embeddings, path=None, url_path=None, abstract
         return index
     elif abstract:
         nodes = get_abstract_data(path, use_langchain=False)
-        text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
+        text_splitter = SentenceSplitter(
+            chunking_tokenizer_fn=split_by_sentence_tokenizer(), chunk_size=1024, chunk_overlap=0
+        )
         storage_context = StorageContext.from_defaults(vector_store=vector_store)
         service_context = ServiceContext.from_defaults(embed_model=embeddings, text_splitter=text_splitter)
         index = VectorStoreIndex(
@@ -227,7 +245,9 @@ def build_index_llama(index_name, embeddings, path=None, url_path=None, abstract
         return index
     elif origin_data:
         nodes = [TextNode(text=item.page_content, metadata=item.metadata) for item in origin_data]
-        text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
+        text_splitter = SentenceSplitter(
+            chunking_tokenizer_fn=split_by_sentence_tokenizer(), chunk_size=1024, chunk_overlap=0
+        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        service_context = ServiceContext.from_defaults(embed_model=embeddings, text_splitter=text_splitter)
        index = VectorStoreIndex(
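For readers skimming the change, here is a minimal standalone sketch (not part of the commit) of the new sentence-based chunking path. It reproduces the split_by_sentence_tokenizer helper added above and runs it on a short Chinese sample; the sample text and printed output are illustrative only, and it assumes the zh_core_web_sm model has been downloaded (python -m spacy download zh_core_web_sm).

# Minimal standalone sketch, not part of the commit.
# Assumes: python -m spacy download zh_core_web_sm
from typing import Callable, List

import spacy


def split_by_sentence_tokenizer(
    pipeline: str = "zh_core_web_sm", max_length: int = 1_000_000
) -> Callable[[str], List[str]]:
    # Excluding "ner" and "tagger" trims components not needed for
    # sentence segmentation; doc.sents still works via what remains.
    sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
    # Raise spaCy's character limit so long documents are accepted.
    sentencizer.max_length = max_length

    def split(text: str) -> List[str]:
        return [sent.text for sent in sentencizer(text).sents]

    return split


if __name__ == "__main__":
    split = split_by_sentence_tokenizer()
    # Illustrative sample; expected shape of the output:
    # ['今天天气很好。', '我们一起去公园散步吧。']
    print(split("今天天气很好。我们一起去公园散步吧。"))

In the diff above, the returned callable is passed to SentenceSplitter as chunking_tokenizer_fn, so chunks break on spaCy-detected sentence boundaries rather than the splitter's defaults; the commit also drops chunk_overlap from 20 to 0 in each call site.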
