fix: Run update_embeddings in examples (#6008)

* added hybrid search example Added an example about hybrid search for faq pipeline on covid dataset * formatted with black formatter * renamed document * fixed * fixed typos * added test added test for hybrid search * fixed whitespaces * removed test for hybrid search * fixed pylint * commented logging * updated hybrid search example * release notes * Update hybrid_search_faq_pipeline.py-815df846dca7e872.yaml * Update hybrid_search_faq_pipeline.py * mention hybrid search example in release notes * reduce installed dependencies in examples test workflow * do not install cuda dependencies * skip models if API key not set; delete document indices * skip models if API key not set; delete document indices * skip models if API key not set; delete document indices * keep roberta-base model and inference extra * pylint * disable pylint no-logging-basicconfig rule --------- Co-authored-by: Julian Risch <[email protected]>
deepset-ai · Oct 10, 2023 · c102b15 · c102b15
1 parent c05f564
commit c102b15
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 22 deletions.
diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml
@@ -42,7 +42,9 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[all,dev]
+        run: |
+          pip install --upgrade pip
+          pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
 
       - name: Run
         run: pytest examples/

diff --git a/examples/basic_faq_pipeline.py b/examples/basic_faq_pipeline.py
@@ -1,23 +1,25 @@
+# Disable pylint errors for logging basicConfig
+# pylint: disable=no-logging-basicconfig
 import logging
 
-logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
-logging.getLogger("haystack").setLevel(logging.INFO)
+import pandas as pd
 
 from haystack.document_stores import ElasticsearchDocumentStore
-
 from haystack.nodes import EmbeddingRetriever
 from haystack.nodes.other.docs2answers import Docs2Answers
-from haystack.utils import launch_es, print_answers, fetch_archive_from_http
-import pandas as pd
 from haystack.pipelines import Pipeline
+from haystack.utils import fetch_archive_from_http, launch_es, print_answers
+
+logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
+logging.getLogger("haystack").setLevel(logging.INFO)
 
 
 def basic_faq_pipeline():
     document_store = ElasticsearchDocumentStore(
         host="localhost",
         username="",
         password="",
-        index="document",
+        index="example-document",
         embedding_field="question_emb",
         embedding_dim=384,
         excluded_meta_data=["question_emb"],
@@ -52,6 +54,7 @@ def basic_faq_pipeline():
     # Convert Dataframe to list of dicts and index them in our DocumentStore
     docs_to_index = df.to_dict(orient="records")
     document_store.write_documents(docs_to_index)
+    document_store.update_embeddings(retriever)
 
     # Initialize a Pipeline (this time without a reader) and ask questions
     pipeline = Pipeline()
@@ -62,6 +65,9 @@ def basic_faq_pipeline():
     prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
 
     print_answers(prediction, details="medium")
+
+    # Remove the index once we're done to save space
+    document_store.delete_index(index="example-document")
     return prediction
 
 

diff --git a/examples/basic_qa_pipeline.py b/examples/basic_qa_pipeline.py
@@ -1,21 +1,23 @@
+# Disable pylint errors for logging basicConfig
+# pylint: disable=no-logging-basicconfig
 import logging
 from pathlib import Path
 
-logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
-logging.getLogger("haystack").setLevel(logging.INFO)
-
 from haystack.document_stores import ElasticsearchDocumentStore
-from haystack.utils import fetch_archive_from_http, print_answers, launch_es
-from haystack.nodes import FARMReader, BM25Retriever
+from haystack.nodes import BM25Retriever, FARMReader
 from haystack.nodes.file_classifier import FileTypeClassifier
-from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.file_converter import TextConverter
+from haystack.nodes.preprocessor import PreProcessor
 from haystack.pipelines import Pipeline
+from haystack.utils import fetch_archive_from_http, launch_es, print_answers
+
+logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
+logging.getLogger("haystack").setLevel(logging.INFO)
 
 
 def basic_qa_pipeline():
     # Initialize a DocumentStore
-    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
+    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document")
 
     # fetch, pre-process and write documents
     doc_dir = "data/basic_qa_pipeline"
@@ -66,6 +68,9 @@ def basic_qa_pipeline():
     )
 
     print_answers(prediction, details="minimum")
+
+    # Remove the index once we're done to save space
+    document_store.delete_index(index="example-document")
     return prediction
 
 

diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py
@@ -55,6 +55,7 @@ def hybrid_search_faq_pipeline():
     # Convert Dataframe to list of dicts and index them in our DocumentStore
     docs_to_index = df.to_dict(orient="records")
     document_store.write_documents(docs_to_index)
+    document_store.update_embeddings(retriever=dense_retriever)
 
     # Initialize a Pipeline (this time without a reader) and ask questions
     pipeline = Pipeline()

diff --git a/examples/test_basic_faq_pipeline.py b/examples/test_basic_faq_pipeline.py
@@ -1,6 +1,6 @@
 from examples.basic_faq_pipeline import basic_faq_pipeline
 
-from haystack.schema import Answer, Document
+from haystack.schema import Answer
 
 
 def test_basic_faq_pipeline():

diff --git a/examples/test_getting_started.py b/examples/test_getting_started.py
@@ -6,8 +6,7 @@
 from haystack.schema import Answer, Document
 
 
-@pytest.mark.integration
-@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
+@pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
 def test_getting_started(provider):
     if provider == "anthropic":
         api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@@ -17,9 +16,11 @@ def test_getting_started(provider):
         api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
     elif provider == "openai":
         api_key = os.environ.get("OPENAI_API_KEY", "")
-    result = getting_started(provider=provider, API_KEY=api_key)
 
-    # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
-    assert isinstance(result, dict)
-    assert type(result["answers"][0]) == Answer
-    assert type(result["documents"][0]) == Document
+    if api_key:
+        result = getting_started(provider=provider, API_KEY=api_key)
+
+        # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
+        assert isinstance(result, dict)
+        assert type(result["answers"][0]) == Answer
+        assert type(result["documents"][0]) == Document
diff --git a/releasenotes/notes/hybrid_search_faq_pipeline.py-815df846dca7e872.yaml b/releasenotes/notes/hybrid_search_faq_pipeline.py-815df846dca7e872.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Added document_store.update_embeddings call to pipeline examples so that embeddings are calculated for newly added documents.