
Commit 49b7dd9

Merge branch 'deepset-ai:main' into ci-deepset-ai#5931-isort
2 parents: 682ddec + 21d894d

130 files changed: +3887 −648 lines

.github/workflows/examples_tests.yml

Lines changed: 3 additions & 1 deletion

```diff
@@ -42,7 +42,9 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[all,dev]
+        run: |
+          pip install --upgrade pip
+          pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
 
       - name: Run
         run: pytest examples/
```

.github/workflows/linting.yml

Lines changed: 3 additions & 0 deletions

```diff
@@ -6,6 +6,9 @@ on:
     paths:
       - "**.py"
       - "**/pyproject.toml"
+      - "!haystack/preview/**/*.py"
+      - "!test/preview/**/*.py"
+      - "!e2e/preview/**/*.py"
 
 env:
   PYTHON_VERSION: "3.8"
```

.github/workflows/linting_preview.yml

Lines changed: 81 additions & 0 deletions

```diff
@@ -0,0 +1,81 @@
+# If you change this name also do it in linting-skipper.yml and ci_metrics.yml
+name: Linting (Preview)
+
+on:
+  pull_request:
+    paths:
+      - "haystack/preview/**/*.py"
+      - "test/preview/**/*.py"
+      - "e2e/preview/**/*.py"
+      - "**/pyproject.toml"
+
+env:
+  PYTHON_VERSION: "3.8"
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # With the default value of 1, there are corner cases where tj-actions/changed-files
+          # fails with a `no merge base` error
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: files
+        uses: tj-actions/changed-files@v39
+        with:
+          files: |
+            **/*.py
+          files_ignore: |
+            test/**
+            rest_api/test/**
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install Haystack
+        run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+
+      - name: Mypy
+        if: steps.files.outputs.any_changed == 'true'
+        run: |
+          mkdir .mypy_cache/
+          mypy --install-types --non-interactive ${{ steps.files.outputs.all_changed_files }} --exclude=rest_api/build/ --exclude=rest_api/test/
+
+  pylint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # With the default value of 1, there are corner cases where tj-actions/changed-files
+          # fails with a `no merge base` error
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: files
+        uses: tj-actions/changed-files@v39
+        with:
+          files: |
+            **/*.py
+          files_ignore: |
+            test/**
+            rest_api/test/**
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install Haystack
+        run: |
+          pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
+          pip install ./haystack-linter
+
+      - name: Pylint
+        if: steps.files.outputs.any_changed == 'true'
+        run: |
+          pylint -ry -j 0 ${{ steps.files.outputs.all_changed_files }}
```

.github/workflows/linting_skipper.yml

Lines changed: 3 additions & 0 deletions

```diff
@@ -6,6 +6,9 @@ on:
     paths-ignore:
       - "**.py"
       - "**/pyproject.toml"
+      - "!haystack/preview/**/*.py"
+      - "!test/preview/**/*.py"
+      - "!e2e/preview/**/*.py"
 
 jobs:
   mypy:
```

.github/workflows/preview_imports.yml

Lines changed: 55 additions & 0 deletions

```diff
@@ -0,0 +1,55 @@
+name: Verify preview imports only preview
+
+on:
+  pull_request:
+    types:
+      - opened
+      - reopened
+      - synchronize
+      - ready_for_review
+    paths:
+      - "haystack/preview/**.py"
+
+jobs:
+  verify-imports:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # With the default value of 1, there are corner cases where tj-actions/changed-files
+          # fails with a `no merge base` error
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: files
+        uses: tj-actions/changed-files@v39
+        with:
+          files: |
+            haystack/preview/**.py
+
+      - name: Check imports
+        shell: python
+        run: |
+          import re
+          regex = r"^(from haystack|import haystack)(?!\.preview| import preview)(.*)"
+
+          changed_files = "${{ steps.files.outputs.all_changed_files }}".split()
+          matches = {}
+          for path in changed_files:
+              with open(path, "r") as f:
+                  file_matches = []
+                  for line in f.readlines():
+                      file_matches.extend(re.finditer(regex, line.strip()))
+                  if file_matches:
+                      matches[path] = file_matches
+
+          for path, match in matches.items():
+              print(f"Bad imports in file '{path}'")
+              for m in match:
+                  print(m.group())
+              print()
+
+          if matches:
+              print("::error:: Imports in haystack.preview can only import from haystack.preview")
+              import sys; sys.exit(1)
```
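For illustration only (not part of the commit): the check above hinges on a single negative-lookahead regex, which flags any `haystack` import that is not scoped to `haystack.preview`. A minimal sketch of how that pattern behaves on a few hypothetical import lines:

```python
import re

# Same pattern as the workflow's "Check imports" step above.
regex = r"^(from haystack|import haystack)(?!\.preview| import preview)(.*)"

# Hypothetical sample lines, chosen to show what the lookahead accepts and rejects.
samples = [
    "from haystack.nodes import PromptNode",  # flagged: reaches outside preview
    "import haystack.schema",                 # flagged
    "from haystack.preview import Pipeline",  # ok: lookahead rejects ".preview"
    "from haystack import preview",           # ok: lookahead rejects " import preview"
    "import json",                            # ignored: not a haystack import
]

for line in samples:
    flagged = re.match(regex, line.strip()) is not None
    print(f"{'BAD' if flagged else 'ok '}  {line}")
```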

.github/workflows/tests.yml

Lines changed: 32 additions & 4 deletions

```diff
@@ -19,6 +19,7 @@ on:
       - "pyproject.toml"
       - "!haystack/preview/**/*.py" # See tests_preview.yml
       - "!test/preview/**/*.py" # See tests_preview.yml
+      - "!e2e/preview/**/*.py" # See e2e_preview.yml
       - "!.github/**/*.py"
       - "!rest_api/**/*.py"
       - "!docs/**/*.py"
@@ -124,10 +125,10 @@ jobs:
         include:
           - topic: document_stores
             os: ubuntu-latest
-            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
           - topic: document_stores
             os: windows-latest
-            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
+            dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
@@ -329,7 +330,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     services:
       elasticsearch:
-        image: elasticsearch:8.8.0
+        image: elasticsearch:8.10.2
         env:
           discovery.type: "single-node"
           xpack.security.enabled: "false"
@@ -346,9 +347,36 @@ jobs:
       - name: Install Haystack
         run: pip install .[elasticsearch8,dev,preprocessing,inference]
 
+      - name: Make elasticsearch comfortable with a disk almost full
+        run: |
+          curl -X PUT "localhost:9200/_cluster/settings?pretty" -H 'Content-Type: application/json' -d'
+          {
+            "persistent": {
+              "cluster.routing.allocation.disk.watermark.low": "90%",
+              "cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
+              "cluster.routing.allocation.disk.watermark.high": "95%",
+              "cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
+              "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
+              "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
+              "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
+              "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
+            }
+          }
+          '
+          curl -X PUT "localhost:9200/*/_settings?expand_wildcards=all&pretty" -H 'Content-Type: application/json' -d'
+          {
+            "index.blocks.read_only_allow_delete": null
+          }
+          '
+
       - name: Run tests
         run: |
-          pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_elasticsearch.py
+          pytest -x -m"document_store and integration" test/document_stores/test_elasticsearch.py
+
+      - name: logs
+        if: failure()
+        run: |
+          docker logs "${{ job.services.elasticsearch.id }}"
 
       - name: Calculate alert data
         id: calculator
```
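A note on the new "Make elasticsearch comfortable with a disk almost full" step: nearly full runner disks trip Elasticsearch's default disk watermarks and can flip indices into read-only mode mid-test, so the two `curl` calls raise the watermarks and clear any existing read-only block. As a minimal sketch (assuming the same local, unauthenticated cluster on port 9200 as in the CI service), the applied settings can be read back to confirm they took effect:

```python
import json
import urllib.request

# Read back the persistent cluster settings applied by the workflow step above.
# Assumes a local, unauthenticated Elasticsearch on port 9200, as in the CI service.
with urllib.request.urlopen("http://localhost:9200/_cluster/settings") as resp:
    settings = json.load(resp)

# The watermark overrides should appear under the "persistent" key.
print(json.dumps(settings.get("persistent", {}), indent=2))
```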

.github/workflows/tests_skipper.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -10,9 +10,10 @@ on:
       - ready_for_review
     paths-ignore:
       - "**.py"
+      - "pyproject.toml"
       - "!haystack/preview/**/*.py" # See tests_preview.yml
       - "!test/preview/**/*.py" # See tests_preview.yml
-      - "pyproject.toml"
+      - "!e2e/preview/**/*.py" # See e2e_preview.yml
       - "!.github/**/*.py"
       - "!rest_api/**/*.py"
       - "!docs/**/*.py"
```

README.md

Lines changed: 51 additions & 1 deletion

````diff
@@ -9,7 +9,57 @@
 | Meta | ![Discord](https://img.shields.io/discord/993534733298450452?logo=discord) ![Twitter Follow](https://img.shields.io/twitter/follow/deepset_ai) |
 </div>
 
-[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case.
+[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision-making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case.
+
+## Quickstart
+
+Haystack is built around the concept of pipelines. A pipeline is a powerful structure that performs an NLP task. It's made up of components connected together. For example, you can connect a `Retriever` and a `PromptNode` to build a Generative Question Answering pipeline that uses your own data.
+
+Try out how Haystack answers questions about Game of Thrones using the Retrieval Augmented Generation (RAG) approach 👇
+
+First, run the minimal Haystack installation:
+
+```sh
+pip install farm-haystack
+```
+
+Then, index your data to the DocumentStore, build a RAG pipeline, and ask a question on your data:
+
+```python
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.utils import build_pipeline, add_example_data, print_answers
+
+# We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
+provider = "openai"
+API_KEY = "sk-..." # ADD YOUR KEY HERE
+
+# We support many different databases. Here, we load a simple and lightweight in-memory database.
+document_store = InMemoryDocumentStore(use_bm25=True)
+
+# Download and add Game of Thrones TXT articles to Haystack DocumentStore.
+# You can also provide a folder with your local documents.
+add_example_data(document_store, "data/GoT_getting_started")
+
+# Build a pipeline with a Retriever to get relevant documents to the query and a PromptNode interacting with LLMs using a custom prompt.
+pipeline = build_pipeline(provider, API_KEY, document_store)
+
+# Ask a question on the data you just added.
+result = pipeline.run(query="Who is the father of Arya Stark?")
+
+# For details, like which documents were used to generate the answer, look into the <result> object
+print_answers(result, details="medium")
+```
+
+The output of the pipeline will reference the documents used to generate the answer:
+
+```
+'Query: Who is the father of Arya Stark?'
+'Answers:'
+[{'answer': 'The father of Arya Stark is Lord Eddard Stark of '
+            'Winterfell. [Document 1, Document 4, Document 5]'}]
+```
+
+Congratulations, you have just built your first Haystack app!
 
 ## Core Concepts
````

annotation_tool/docker-compose.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -13,6 +13,7 @@ services:
       # DEFAULT_ADMIN_PASSWORD: "DEMO_PASSWORD"
       # COOKIE_KEYS: "somesafecookiekeys"
       # JWT_SECRET: "somesafesecret"
+      # DOMAIN_WHITELIST: "*"
     ports:
       - "7001:7001"
     links:
```

e2e/preview/pipelines/test_extractive_qa_pipeline.py

Lines changed: 25 additions & 11 deletions

```diff
@@ -1,25 +1,39 @@
+import json
+
 from haystack.preview import Pipeline, Document
-from haystack.preview.document_stores import MemoryDocumentStore
-from haystack.preview.components.retrievers import MemoryBM25Retriever
+from haystack.preview.document_stores import InMemoryDocumentStore
+from haystack.preview.components.retrievers import InMemoryBM25Retriever
 from haystack.preview.components.readers import ExtractiveReader
 
 
-def test_extractive_qa_pipeline():
-    document_store = MemoryDocumentStore()
+def test_extractive_qa_pipeline(tmp_path):
+    # Create the pipeline
+    qa_pipeline = Pipeline()
+    qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
+    qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
+    qa_pipeline.connect("retriever", "reader")
+
+    # Draw the pipeline
+    qa_pipeline.draw(tmp_path / "test_extractive_qa_pipeline.png")
+
+    # Serialize the pipeline to JSON
+    with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:
+        print(json.dumps(qa_pipeline.to_dict(), indent=4))
+        json.dump(qa_pipeline.to_dict(), f)
 
+    # Load the pipeline back
+    with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:
+        qa_pipeline = Pipeline.from_dict(json.load(f))
+
+    # Populate the document store
     documents = [
         Document(text="My name is Jean and I live in Paris."),
         Document(text="My name is Mark and I live in Berlin."),
         Document(text="My name is Giorgio and I live in Rome."),
     ]
+    qa_pipeline.get_component("retriever").document_store.write_documents(documents)
 
-    document_store.write_documents(documents)
-
-    qa_pipeline = Pipeline()
-    qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever")
-    qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
-    qa_pipeline.connect("retriever", "reader")
-
+    # Query and assert
     questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
     answers_spywords = ["Jean", "Mark", "Giorgio"]
```
