fix: fix test failures with Transformers models in PRs from forks (#8809)

* trigger
* try pinning sentence transformers
* make integr tests run right away
* pin transformers instead
* older transformers version
* rm transformers pin
* try ignoring cache
* change ubuntu version
* try removing token
* try again
* more HF_API_TOKEN local deletions
* restore test priority
* rm leftover
* more deletions
* moreee
* more
* deletions
* restore jobs order
deepset-ai · Feb 4, 2025 · 5ae9488
1 parent f1679f1
commit 5ae9488
Show file tree

Hide file tree

Showing 9 changed files with 38 additions and 19 deletions.
diff --git a/test/components/classifiers/test_zero_shot_document_classifier.py b/test/components/classifiers/test_zero_shot_document_classifier.py
@@ -137,7 +137,8 @@ def test_run_unit(self, hf_pipeline_mock):
         assert result["documents"][1].to_dict()["classification"]["label"] == "negative"
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         component = TransformersZeroShotDocumentClassifier(
             model="cross-encoder/nli-deberta-v3-xsmall", labels=["positive", "negative"]
         )

diff --git a/test/components/embedders/test_sentence_transformers_text_embedder.py b/test/components/embedders/test_sentence_transformers_text_embedder.py
@@ -261,10 +261,11 @@ def test_run_wrong_input_format(self):
             embedder.run(text=list_integers_input)
 
     @pytest.mark.integration
-    def test_run_trunc(self):
+    def test_run_trunc(self, monkeypatch):
         """
         sentence-transformers/paraphrase-albert-small-v2 maps sentences & paragraphs to a 768 dimensional dense vector space
         """
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         checkpoint = "sentence-transformers/paraphrase-albert-small-v2"
         text = "a nice text to embed"
 

diff --git a/test/components/evaluators/test_sas_evaluator.py b/test/components/evaluators/test_sas_evaluator.py
@@ -104,7 +104,8 @@ def test_run_not_warmed_up(self):
             evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions)
 
     @pytest.mark.integration
-    def test_run_with_matching_predictions(self):
+    def test_run_with_matching_predictions(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         evaluator = SASEvaluator()
         ground_truths = [
             "A construction budget of US $2.3 billion",
@@ -124,7 +125,8 @@ def test_run_with_matching_predictions(self):
         assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0])
 
     @pytest.mark.integration
-    def test_run_with_single_prediction(self):
+    def test_run_with_single_prediction(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         evaluator = SASEvaluator()
 
         ground_truths = ["US $2.3 billion"]
@@ -137,7 +139,8 @@ def test_run_with_single_prediction(self):
         assert result["individual_scores"] == pytest.approx([0.689089], abs=1e-5)
 
     @pytest.mark.integration
-    def test_run_with_mismatched_predictions(self):
+    def test_run_with_mismatched_predictions(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         evaluator = SASEvaluator()
         ground_truths = [
             "US $2.3 billion",
@@ -156,7 +159,8 @@ def test_run_with_mismatched_predictions(self):
         assert result["individual_scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5)
 
     @pytest.mark.integration
-    def test_run_with_bi_encoder_model(self):
+    def test_run_with_bi_encoder_model(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2")
         ground_truths = [
             "A construction budget of US $2.3 billion",
@@ -175,7 +179,8 @@ def test_run_with_bi_encoder_model(self):
         assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0])
 
     @pytest.mark.integration
-    def test_run_with_cross_encoder_model(self):
+    def test_run_with_cross_encoder_model(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
         ground_truths = [
             "A construction budget of US $2.3 billion",

diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
@@ -293,7 +293,8 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
 
     @pytest.mark.integration
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    def test_live_run(self):
+    def test_live_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
         llm = HuggingFaceLocalChatGenerator(

diff --git a/test/components/generators/test_hugging_face_local_generator.py b/test/components/generators/test_hugging_face_local_generator.py
@@ -454,8 +454,9 @@ def test_stop_words_criteria_using_hf_tokenizer(self):
         assert criteria(generated_text_ids, scores=None) is True
 
     @pytest.mark.integration
-    def test_hf_pipeline_runs_with_our_criteria(self):
+    def test_hf_pipeline_runs_with_our_criteria(self, monkeypatch):
         """Test that creating our own StopWordsCriteria and passing it to a Huggingface pipeline works."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         generator = HuggingFaceLocalGenerator(
             model="google/flan-t5-small", task="text2text-generation", stop_words=["unambiguously"]
         )
@@ -466,7 +467,8 @@ def test_hf_pipeline_runs_with_our_criteria(self):
 
     @pytest.mark.integration
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    def test_live_run(self):
+    def test_live_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         llm = HuggingFaceLocalGenerator(model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50})
         llm.warm_up()
 

diff --git a/test/components/rankers/test_sentence_transformers_diversity.py b/test/components/rankers/test_sentence_transformers_diversity.py
@@ -574,10 +574,11 @@ def test_pipeline_serialise_deserialise(self):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("similarity", ["dot_product", "cosine"])
-    def test_run(self, similarity):
+    def test_run(self, similarity, monkeypatch):
         """
         Tests that run method returns documents in the correct order
         """
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         ranker = SentenceTransformersDiversityRanker(
             model="sentence-transformers/all-MiniLM-L6-v2", similarity=similarity
         )
@@ -601,7 +602,8 @@ def test_run(self, similarity):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("similarity", ["dot_product", "cosine"])
-    def test_run_real_world_use_case(self, similarity):
+    def test_run_real_world_use_case(self, similarity, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         ranker = SentenceTransformersDiversityRanker(
             model="sentence-transformers/all-MiniLM-L6-v2", similarity=similarity
         )
@@ -673,7 +675,8 @@ def test_run_real_world_use_case(self, similarity):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("similarity", ["dot_product", "cosine"])
-    def test_run_with_maximum_margin_relevance_strategy(self, similarity):
+    def test_run_with_maximum_margin_relevance_strategy(self, similarity, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         query = "renewable energy sources"
         docs = [
             Document(content="18th-century French literature"),

diff --git a/test/components/readers/test_extractive.py b/test/components/readers/test_extractive.py
@@ -776,7 +776,8 @@ def test_deduplicate_by_overlap(
 
 
 @pytest.mark.integration
-def test_t5():
+def test_t5(monkeypatch):
+    monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
     reader = ExtractiveReader("sjrhuschlee/flan-t5-base-squad2")
     reader.warm_up()
     answers = reader.run(example_queries[0], example_documents[0], top_k=2)[
@@ -800,7 +801,8 @@ def test_t5():
 
 
 @pytest.mark.integration
-def test_roberta():
+def test_roberta(monkeypatch):
+    monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
     reader = ExtractiveReader("deepset/tinyroberta-squad2")
     reader.warm_up()
     answers = reader.run(example_queries[0], example_documents[0], top_k=2)[
@@ -829,7 +831,8 @@ def test_roberta():
 
 
 @pytest.mark.integration
-def test_matches_hf_pipeline():
+def test_matches_hf_pipeline(monkeypatch):
+    monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
     reader = ExtractiveReader(
         "deepset/tinyroberta-squad2", device=ComponentDevice.from_str("cpu"), overlap_threshold=None
     )

diff --git a/test/components/routers/test_transformers_text_router.py b/test/components/routers/test_transformers_text_router.py
@@ -172,7 +172,8 @@ def test_run_unit(self, hf_pipeline_mock, mock_auto_config_from_pretrained):
         assert out == {"en": "What is the color of the sky?"}
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection")
         router.warm_up()
         out = router.run("What is the color of the sky?")
@@ -202,7 +203,8 @@ def test_run(self):
         assert out == {"en": "What is the color of the sky?"}
 
     @pytest.mark.integration
-    def test_wrong_labels(self):
+    def test_wrong_labels(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection", labels=["en", "de"])
         with pytest.raises(ValueError):
             router.warm_up()
diff --git a/test/components/routers/test_zero_shot_text_router.py b/test/components/routers/test_zero_shot_text_router.py
@@ -106,7 +106,8 @@ def test_run_unit(self, hf_pipeline_mock):
         assert out == {"query": "What is the color of the sky?"}
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         router = TransformersZeroShotTextRouter(labels=["query", "passage"])
         router.warm_up()
         out = router.run("What is the color of the sky?")