1
- import os
1
+ import hashlib
2
2
import logging
3
+ import os
3
4
4
- import hashlib
5
5
import PyPDF2
6
+ from langchain_community .chat_models import ChatOpenAI
7
+ from langchain_community .embeddings .huggingface import HuggingFaceEmbeddings
8
+ from langchain_community .vectorstores import FAISS
9
+ from langchain_openai import OpenAIEmbeddings
6
10
from tqdm import tqdm
7
11
12
+ from modules .config import local_embedding
8
13
from modules .presets import *
9
14
from modules .utils import *
10
- from modules .config import local_embedding
11
15
12
16
13
17
def get_documents (file_src ):
@@ -28,8 +32,8 @@ def get_documents(file_src):
28
32
if file_type == ".pdf" :
29
33
logging .debug ("Loading PDF..." )
30
34
try :
31
- from modules .pdf_func import parse_pdf
32
35
from modules .config import advance_docs
36
+ from modules .pdf_func import parse_pdf
33
37
34
38
two_column = advance_docs ["pdf" ].get ("two_column" , False )
35
39
pdftext = parse_pdf (filepath , two_column ).text
@@ -43,12 +47,14 @@ def get_documents(file_src):
43
47
metadata = {"source" : filepath })]
44
48
elif file_type == ".docx" :
45
49
logging .debug ("Loading Word..." )
46
- from langchain .document_loaders import UnstructuredWordDocumentLoader
50
+ from langchain .document_loaders import \
51
+ UnstructuredWordDocumentLoader
47
52
loader = UnstructuredWordDocumentLoader (filepath )
48
53
texts = loader .load ()
49
54
elif file_type == ".pptx" :
50
55
logging .debug ("Loading PowerPoint..." )
51
- from langchain .document_loaders import UnstructuredPowerPointLoader
56
+ from langchain .document_loaders import \
57
+ UnstructuredPowerPointLoader
52
58
loader = UnstructuredPowerPointLoader (filepath )
53
59
texts = loader .load ()
54
60
elif file_type == ".epub" :
@@ -93,9 +99,6 @@ def construct_index(
93
99
separator = " " ,
94
100
load_from_cache_if_possible = True ,
95
101
):
96
- from langchain .chat_models import ChatOpenAI
97
- from langchain .vectorstores import FAISS
98
-
99
102
if api_key :
100
103
os .environ ["OPENAI_API_KEY" ] = api_key
101
104
else :
@@ -109,11 +112,9 @@ def construct_index(
109
112
index_name = get_file_hash (file_src )
110
113
index_path = f"./index/{ index_name } "
111
114
if local_embedding :
112
- from langchain .embeddings .huggingface import HuggingFaceEmbeddings
113
115
embeddings = HuggingFaceEmbeddings (
114
116
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2" )
115
117
else :
116
- from langchain .embeddings import OpenAIEmbeddings
117
118
if os .environ .get ("OPENAI_API_TYPE" , "openai" ) == "openai" :
118
119
embeddings = OpenAIEmbeddings (openai_api_base = os .environ .get (
119
120
"OPENAI_API_BASE" , None ), openai_api_key = os .environ .get ("OPENAI_EMBEDDING_API_KEY" , api_key ))
@@ -122,7 +123,7 @@ def construct_index(
122
123
model = os .environ ["AZURE_EMBEDDING_MODEL_NAME" ], openai_api_base = os .environ ["AZURE_OPENAI_API_BASE_URL" ], openai_api_type = "azure" )
123
124
if os .path .exists (index_path ) and load_from_cache_if_possible :
124
125
logging .info (i18n ("找到了缓存的索引文件,加载中……" ))
125
- return FAISS .load_local (index_path , embeddings )
126
+ return FAISS .load_local (index_path , embeddings , allow_dangerous_deserialization = True )
126
127
else :
127
128
documents = get_documents (file_src )
128
129
logging .debug (i18n ("构建索引中……" ))
0 commit comments