Skip to content

Commit

Permalink
fix: Run update_embeddings in examples (#6008)
Browse files Browse the repository at this point in the history
* added hybrid search example

Added an example of hybrid search for an FAQ pipeline on the COVID dataset

* formatted with black formatter

* renamed document

* fixed

* fixed typos

* added test

added test for hybrid search

* fixed whitespaces

* removed test for hybrid search

* fixed pylint

* commented logging

* updated hybrid search example

* release notes

* Update hybrid_search_faq_pipeline.py-815df846dca7e872.yaml

* Update hybrid_search_faq_pipeline.py

* mention hybrid search example in release notes

* reduce installed dependencies in examples test workflow

* do not install cuda dependencies

* skip models if API key not set; delete document indices

* skip models if API key not set; delete document indices

* skip models if API key not set; delete document indices

* keep roberta-base model and inference extra

* pylint

* disable pylint no-logging-basicconfig rule

---------

Co-authored-by: Julian Risch <[email protected]>
  • Loading branch information
nickprock and julian-risch authored Oct 10, 2023
1 parent c05f564 commit c102b15
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 22 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/examples_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}

- name: Install Haystack
run: pip install .[all,dev]
run: |
pip install --upgrade pip
pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
- name: Run
run: pytest examples/
Expand Down
18 changes: 12 additions & 6 deletions examples/basic_faq_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
# Disable pylint errors for logging basicConfig
# pylint: disable=no-logging-basicconfig
import logging

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
import pandas as pd

from haystack.document_stores import ElasticsearchDocumentStore

from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.utils import launch_es, print_answers, fetch_archive_from_http
import pandas as pd
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


def basic_faq_pipeline():
document_store = ElasticsearchDocumentStore(
host="localhost",
username="",
password="",
index="document",
index="example-document",
embedding_field="question_emb",
embedding_dim=384,
excluded_meta_data=["question_emb"],
Expand Down Expand Up @@ -52,6 +54,7 @@ def basic_faq_pipeline():
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever)

# Initialize a Pipeline (this time without a reader) and ask questions
pipeline = Pipeline()
Expand All @@ -62,6 +65,9 @@ def basic_faq_pipeline():
prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})

print_answers(prediction, details="medium")

# Remove the index once we're done to save space
document_store.delete_index(index="example-document")
return prediction


Expand Down
19 changes: 12 additions & 7 deletions examples/basic_qa_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
# Disable pylint errors for logging basicConfig
# pylint: disable=no-logging-basicconfig
import logging
from pathlib import Path

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, print_answers, launch_es
from haystack.nodes import FARMReader, BM25Retriever
from haystack.nodes import BM25Retriever, FARMReader
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.file_converter import TextConverter
from haystack.nodes.preprocessor import PreProcessor
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


def basic_qa_pipeline():
# Initialize a DocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document")

# fetch, pre-process and write documents
doc_dir = "data/basic_qa_pipeline"
Expand Down Expand Up @@ -66,6 +68,9 @@ def basic_qa_pipeline():
)

print_answers(prediction, details="minimum")

# Remove the index once we're done to save space
document_store.delete_index(index="example-document")
return prediction


Expand Down
1 change: 1 addition & 0 deletions examples/hybrid_search_faq_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def hybrid_search_faq_pipeline():
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever=dense_retriever)

# Initialize a Pipeline (this time without a reader) and ask questions
pipeline = Pipeline()
Expand Down
2 changes: 1 addition & 1 deletion examples/test_basic_faq_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from examples.basic_faq_pipeline import basic_faq_pipeline

from haystack.schema import Answer, Document
from haystack.schema import Answer


def test_basic_faq_pipeline():
Expand Down
15 changes: 8 additions & 7 deletions examples/test_getting_started.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from haystack.schema import Answer, Document


@pytest.mark.integration
@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
@pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
def test_getting_started(provider):
if provider == "anthropic":
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
Expand All @@ -17,9 +16,11 @@ def test_getting_started(provider):
api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
elif provider == "openai":
api_key = os.environ.get("OPENAI_API_KEY", "")
result = getting_started(provider=provider, API_KEY=api_key)

# Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
assert isinstance(result, dict)
assert type(result["answers"][0]) == Answer
assert type(result["documents"][0]) == Document
if api_key:
result = getting_started(provider=provider, API_KEY=api_key)

# Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
assert isinstance(result, dict)
assert type(result["answers"][0]) == Answer
assert type(result["documents"][0]) == Document
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
Added document_store.update_embeddings call to pipeline examples so that embeddings are calculated for newly added documents.

0 comments on commit c102b15

Please sign in to comment.