-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc_preprocess.py
44 lines (28 loc) · 1.21 KB
/
doc_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from pdf_loader import Documents
import argparse
import os
import tqdm
import pickle
import doc_preprocess_config
import json
def main(config) -> None:
    """Query every entry under ``config.papers_path`` and save the results.

    Steps performed:

    1. Ensure ``config.output_path`` exists as a directory.
    2. Write ``config.to_dict()`` to ``config.json`` inside that directory.
    3. For each entry listed in ``config.papers_path``, build a ``Documents``
       object for it, initialise its retriever, and invoke the retriever
       with ``config.query``.
    4. Pickle the ``{entry name: retriever result}`` mapping to
       ``processed_docs.pkl`` inside the output directory.

    Args:
        config: Object exposing ``output_path``, ``papers_path``, ``query``
            and a ``to_dict()`` method (as returned by
            ``doc_preprocess_config.get_config``).
    """
    # exist_ok avoids the check-then-create race of a separate existence test.
    os.makedirs(config.output_path, exist_ok=True)

    # os.path.join keeps the outputs inside output_path even when the path
    # was supplied without a trailing separator; plain string concatenation
    # would otherwise produce e.g. "outconfig.json" next to the directory.
    config_file = os.path.join(config.output_path, 'config.json')
    with open(config_file, 'w') as fw:
        json.dump(config.to_dict(), fw, indent=2)

    paper_list = os.listdir(config.papers_path)
    retrieved_docs = {}
    for paper in tqdm.tqdm(paper_list, total=len(paper_list)):
        documents = Documents(pdf_directory=os.path.join(config.papers_path, paper))
        retriever = documents.init_retriever()
        retrieved_docs[paper] = retriever.invoke(config.query)
        # Delete the collection held by this Documents instance before the
        # next entry is processed.
        documents.db.delete_collection()

    results_file = os.path.join(config.output_path, 'processed_docs.pkl')
    with open(results_file, 'wb') as fw:
        pickle.dump(retrieved_docs, fw)
if __name__ == '__main__':
    # Script entry point: parse the command-line flags, turn them into a
    # config object via doc_preprocess_config.get_config, and run main().
    cli = argparse.ArgumentParser()
    cli.add_argument('--process_id', type=str, required=True)
    cli.add_argument('--papers_path', type=str, required=True)
    cli.add_argument('--prompt_file', type=str, default='prompts.json')
    cli.add_argument('--output_path', type=str, required=True)

    cli_args = cli.parse_args()
    run_config = doc_preprocess_config.get_config(cli_args)
    main(run_config)