import os
from pathlib import Path
from getpass import getpass

import gdown  # Downloads files/folders from Google Drive.

from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack.components.converters import (
    MarkdownToDocument,
    PyPDFToDocument,
    TextFileToDocument,
)
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.routers import FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
# In this tutorial, we:
# 1. Convert files of different MIME types into clean (whitespace-normalized)
#    Haystack documents.
# 2. Split those documents into chunks and embed the chunks into a document store.
# 3. Ask questions of the documents.
#
# In lesson 1 the data was pre-processed, so we didn't need a DocumentCleaner
# or a DocumentSplitter.
url = "https://drive.google.com/drive/folders/1n9yqq5Gl_HWfND5bTlrCwAOycMDt5EMj"
output_dir = "dataset/recipe_files"
print("Downloading files")
gdown.download_folder(url, quiet=True, output=output_dir)
document_store = InMemoryDocumentStore()
# Routes each input file to the appropriate converter based on its MIME type.
file_type_router = FileTypeRouter(
    mime_types=["text/plain", "application/pdf", "text/markdown"]
)
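# Note: FileTypeRouter also exposes an "unclassified" output for files that
# match none of the listed MIME types; it is left unconnected here, so any
# such files are simply ignored.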
# TextFileToDocument converts plain-text files to Haystack documents.
text_file_converter = TextFileToDocument()
# MarkdownToDocument converts Markdown files to Haystack documents.
markdown_converter = MarkdownToDocument()
# PyPDFToDocument converts PDF files to Haystack documents.
pdf_converter = PyPDFToDocument()
# Merges the documents from all three converters into a single list.
document_joiner = DocumentJoiner()
# Removes excess whitespace from the documents.
document_cleaner = DocumentCleaner()
# Splits each document into 150-word chunks with a 50-word overlap, so
# neighbouring chunks keep enough shared context to be understood on their own.
document_splitter = DocumentSplitter(
    split_by="word", split_length=150, split_overlap=50
)
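# With split_length=150 and split_overlap=50, each new chunk starts
# 150 - 50 = 100 words after the previous one, so consecutive chunks share
# exactly 50 words.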
# Embeds each chunk locally with a sentence-transformers model
# (all-MiniLM-L6-v2 produces 384-dimensional vectors).
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2"
)
# Writes the embedded chunks into the document store.
document_writer = DocumentWriter(document_store)
# Assemble the preprocessing (indexing) pipeline from the components above.
preprocessing_pipeline = Pipeline()
preprocessing_pipeline.add_component(
instance=file_type_router, name="file_type_router"
)
preprocessing_pipeline.add_component(
instance=text_file_converter, name="text_file_converter"
)
preprocessing_pipeline.add_component(
instance=markdown_converter, name="markdown_converter"
)
preprocessing_pipeline.add_component(instance=pdf_converter, name="pypdf_converter")
preprocessing_pipeline.add_component(
instance=document_joiner, name="document_joiner"
)
preprocessing_pipeline.add_component(
instance=document_cleaner, name="document_cleaner"
)
preprocessing_pipeline.add_component(
instance=document_splitter, name="document_splitter"
)
preprocessing_pipeline.add_component(
instance=document_embedder, name="document_embedder"
)
preprocessing_pipeline.add_component(
instance=document_writer, name="document_writer"
)
# Route files with MIME type text/plain to text_file_converter.
preprocessing_pipeline.connect(
    "file_type_router.text/plain", "text_file_converter.sources"
)
# Route files with MIME type application/pdf to pypdf_converter.
preprocessing_pipeline.connect(
    "file_type_router.application/pdf", "pypdf_converter.sources"
)
# Route files with MIME type text/markdown to markdown_converter.
preprocessing_pipeline.connect(
    "file_type_router.text/markdown", "markdown_converter.sources"
)
# Pass the Haystack documents from the file converters to the document joiner.
preprocessing_pipeline.connect("text_file_converter", "document_joiner")
preprocessing_pipeline.connect("pypdf_converter", "document_joiner")
preprocessing_pipeline.connect("markdown_converter", "document_joiner")
# Clean the joined list of Haystack documents.
preprocessing_pipeline.connect("document_joiner", "document_cleaner")
# Split the cleaned documents into chunks.
preprocessing_pipeline.connect("document_cleaner", "document_splitter")
# Embed the chunks as vectors.
preprocessing_pipeline.connect("document_splitter", "document_embedder")
# Write the embedded chunks to the document store.
preprocessing_pipeline.connect("document_embedder", "document_writer")
print("Processing Documents")
preprocessing_pipeline.run(
{"file_type_router": {"sources": list(Path(output_dir).glob("**/*"))}}
)
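# Sanity check (assumes the Haystack 2.x document-store API): report how many
# embedded chunks were written.
print(f"Indexed {document_store.count_documents()} chunks")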
# --- Query the vector document store ---
if "HF_API_TOKEN" not in os.environ:
    os.environ["HF_API_TOKEN"] = getpass("Enter Hugging Face token: ")
template = """
Answer the questions based on the given context.
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{ question }}
Answer:
"""
# Assemble the retrieval-augmented generation (RAG) pipeline.
pipe = Pipeline()
# Embed the question with the same model used for the document chunks, so the
# query vector and the chunk vectors live in the same embedding space.
pipe.add_component(
    "embedder",
    SentenceTransformersTextEmbedder(
        model="sentence-transformers/all-MiniLM-L6-v2"
    ),
)
# Retrieve the chunks whose embeddings are most similar to the question's.
pipe.add_component(
    "retriever", InMemoryEmbeddingRetriever(document_store=document_store)
)
pipe.add_component("prompt_builder", PromptBuilder(template=template))
# Generate the answer with a hosted model via the Hugging Face serverless
# Inference API.
pipe.add_component(
    "llm",
    HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
    ),
)
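# Note: in recent Haystack versions, HuggingFaceAPIGenerator reads its API
# token from the HF_API_TOKEN environment variable by default, which is why
# the token was exported above.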
# Feed the question's embedding into the retriever as the query embedding.
pipe.connect("embedder.embedding", "retriever.query_embedding")
# Feed the retrieved documents into the prompt template.
pipe.connect("retriever", "prompt_builder.documents")
# Feed the rendered prompt to the LLM.
pipe.connect("prompt_builder", "llm")
question = "Which of the following recipes would be met with the most disapproval from Ron Swanson? Explain why in Ron Swanson's style of speech"
print(f"\n\nAsking the Question `{question}`")
response = pipe.run(
    {
        "embedder": {"text": question},
        "prompt_builder": {"question": question},
        # Cap the generated answer at 350 new tokens.
        "llm": {"generation_kwargs": {"max_new_tokens": 350}},
    }
)
reply = response["llm"]["replies"][0]
print(f"The answer is '{reply}'")
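# Optional clean-up: delete the downloaded dataset once you are done with it
# (left commented out so reruns can reuse the cached files).
# import shutil
# shutil.rmtree(output_dir, ignore_errors=True)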