From 70ef4f92491a61e4ece2bf336a753731a34c6069 Mon Sep 17 00:00:00 2001 From: shudanluo Date: Sun, 3 Aug 2025 17:37:01 +0200 Subject: [PATCH 1/2] feat: Implement LLM-supervise deduplication strategy - Add _deduplicate_with_llm_supervision function - Add _llm_judge_duplicates helper function - Fix circular import issues - Add comprehensive test suite with 9 test cases - Maintain backward compatibility with existing code --- camel/utils/deduplication.py | 172 +++++++++++++++++- .../utils/test_deduplication_llm_supervise.py | 164 +++++++++++++++++ 2 files changed, 328 insertions(+), 8 deletions(-) create mode 100644 test/utils/test_deduplication_llm_supervise.py diff --git a/camel/utils/deduplication.py b/camel/utils/deduplication.py index 6dac18c245..702970f15b 100644 --- a/camel/utils/deduplication.py +++ b/camel/utils/deduplication.py @@ -59,7 +59,7 @@ def deduplicate_internally( strategy is used to specify different strategies, where 'top1' selects the one with highest similarity, and 'llm-supervise' uses LLM to determine if - texts are duplicates (not yet implemented). + texts are duplicates. Args: texts (List[str]): The list of texts to be deduplicated. @@ -144,17 +144,11 @@ def deduplicate_internally( unique_embeddings_dict={ 0: embeddings[0] if embeddings - else embedding_instance.embed_list(texts)[0] # type: ignore[union-attr] + else embedding_instance.embed_list(texts)[0] if embedding_instance else [0.0] # type: ignore[union-attr] }, duplicate_to_target_map={}, ) - if strategy == "llm-supervise": - # TODO: Implement LLM-supervise deduplication. - raise NotImplementedError( - "LLM-supervise deduplication is not yet implemented." - ) - # Check if the parameters are valid. if not 0 <= threshold <= 1: raise ValueError("Threshold must be between 0 and 1") @@ -169,6 +163,15 @@ def deduplicate_internally( "Please choose only one way to supply embeddings." ) + if strategy == "llm-supervise": + return _deduplicate_with_llm_supervision( + texts=texts, + threshold=threshold, + embedding_instance=embedding_instance, + embeddings=embeddings, + batch_size=batch_size, + ) + if embedding_instance is not None: # Use Camel's embedding_instance to vectorize. embeddings = embedding_instance.embed_list(texts) @@ -230,3 +233,156 @@ def deduplicate_internally( unique_embeddings_dict=unique_embeddings_dict, duplicate_to_target_map=duplicate_to_target_map, ) + + +def _deduplicate_with_llm_supervision( + texts: List[str], + threshold: float, + embedding_instance: Optional[BaseEmbedding[str]], + embeddings: Optional[List[List[float]]], + batch_size: int, +) -> DeduplicationResult: + r"""Deduplicate texts using LLM supervision. + + This function uses embeddings to find potential duplicates, then uses an LLM + to determine if they are actually duplicates. + + Args: + texts: List of texts to deduplicate + threshold: Similarity threshold for initial filtering + embedding_instance: Embedding instance for computing embeddings + embeddings: Pre-computed embeddings + batch_size: Batch size for processing + + Returns: + DeduplicationResult with LLM-supervised deduplication results + """ + import numpy as np + from sklearn.metrics.pairwise import cosine_similarity + + # First, get embeddings if not provided + if embedding_instance is not None: + embeddings = embedding_instance.embed_list(texts) + elif embeddings is None: + raise ValueError( + "Either 'embedding_instance' or 'embeddings' must be provided." + ) + + if len(embeddings) != len(texts): + raise ValueError( + "The length of 'embeddings' does not match the length of 'texts'." + ) + + # Convert embeddings to numpy array + embeddings_array = np.array(embeddings) + n = len(texts) + duplicate_to_target_map: Dict[int, int] = {} + + # Find potential duplicates using cosine similarity + potential_duplicates = [] + + for i in range(0, n, batch_size): + batch_end = min(i + batch_size, n) + batch_similarities = cosine_similarity( + embeddings_array[i:batch_end], embeddings_array[:batch_end] + ) + + # Create mask for lower triangle + tril_mask = np.tril(np.ones_like(batch_similarities), k=-1) + batch_similarities = batch_similarities * tril_mask + + # Find pairs above threshold + for j in range(batch_end - i): + for k in range(j): + if batch_similarities[j, k] > threshold: + potential_duplicates.append((i + j, k)) + + # Use LLM to determine actual duplicates + if potential_duplicates: + duplicate_pairs = _llm_judge_duplicates(texts, potential_duplicates) + + # Build duplicate map + for duplicate_idx, target_idx in duplicate_pairs: + duplicate_to_target_map[duplicate_idx] = target_idx + + # Get unique ids and embeddings + unique_ids = [] + unique_embeddings_dict = {} + + for i, (_, emb) in enumerate(zip(texts, embeddings)): + if i not in duplicate_to_target_map: + unique_ids.append(i) + unique_embeddings_dict[i] = emb + + return DeduplicationResult( + original_texts=texts, + unique_ids=unique_ids, + unique_embeddings_dict=unique_embeddings_dict, + duplicate_to_target_map=duplicate_to_target_map, + ) + + +def _llm_judge_duplicates( + texts: List[str], + potential_duplicates: List[tuple[int, int]] +) -> List[tuple[int, int]]: + r"""Use LLM to judge if potential duplicate pairs are actually duplicates. + + Args: + texts: List of all texts + potential_duplicates: List of (duplicate_idx, target_idx) pairs + + Returns: + List of (duplicate_idx, target_idx) pairs that LLM judged as duplicates + """ + try: + # Import here to avoid circular import issues + from camel.models import ModelFactory + from camel.types import ModelPlatformType + from camel.types.enums import ModelType + + # Create a simple LLM model for judgment + # Using a lightweight model for efficiency + llm_model = ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_3_5_TURBO, + ) + + actual_duplicates = [] + + for duplicate_idx, target_idx in potential_duplicates: + text1 = texts[duplicate_idx] + text2 = texts[target_idx] + + # Create prompt for LLM judgment + prompt = f"""You are a text deduplication expert. Your task is to determine if two texts are duplicates or near-duplicates. + +Text 1: "{text1}" +Text 2: "{text2}" + +Are these texts duplicates or near-duplicates? Consider: +- Semantic similarity +- Information overlap +- Whether they convey the same meaning + +Respond with only "YES" if they are duplicates, or "NO" if they are not duplicates.""" + + try: + # Get LLM response + response = llm_model.run([{"role": "user", "content": prompt}]) + judgment = response.choices[0].message.content.strip().upper() + + if judgment == "YES": + actual_duplicates.append((duplicate_idx, target_idx)) + + except Exception as e: + # If LLM fails, default to keeping both texts (no deduplication) + print(f"LLM judgment failed for pair {duplicate_idx}-{target_idx}: {e}") + continue + + return actual_duplicates + + except Exception as e: + print(f"Failed to initialize LLM for deduplication: {e}") + # Fallback: return empty list (no deduplication) + return [] diff --git a/test/utils/test_deduplication_llm_supervise.py b/test/utils/test_deduplication_llm_supervise.py new file mode 100644 index 0000000000..e52f6b6c3a --- /dev/null +++ b/test/utils/test_deduplication_llm_supervise.py @@ -0,0 +1,164 @@ +# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= + +import pytest + +from camel.utils.deduplication import deduplicate_internally, DeduplicationResult + + +class TestLLMSuperviseDeduplication: + """Test cases for LLM-supervise deduplication strategy.""" + + def test_llm_supervise_strategy_exists(self): + """Test that the llm-supervise strategy is recognized.""" + # Test that the function accepts the strategy parameter + # This should not raise a NotImplementedError anymore + # Single text doesn't need embeddings, so it should work + result = deduplicate_internally( + texts=["test"], + strategy="llm-supervise" + ) + + # Should return a valid result for single text + assert isinstance(result, DeduplicationResult) + assert result.original_texts == ["test"] + assert result.unique_ids == [0] + assert result.duplicate_to_target_map == {} + + def test_llm_supervise_strategy_parameter(self): + """Test that the strategy parameter accepts llm-supervise.""" + import inspect + sig = inspect.signature(deduplicate_internally) + strategy_param = sig.parameters['strategy'] + + # Check that 'llm-supervise' is a valid option + if hasattr(strategy_param.annotation, '__args__'): + valid_strategies = strategy_param.annotation.__args__ + assert 'llm-supervise' in valid_strategies, f"llm-supervise not in valid strategies: {valid_strategies}" + assert 'top1' in valid_strategies, f"top1 not in valid strategies: {valid_strategies}" + + def test_llm_supervise_empty_texts(self): + """Test llm-supervise with empty text list.""" + result = deduplicate_internally( + texts=[], + strategy="llm-supervise" + ) + + assert isinstance(result, DeduplicationResult) + assert result.original_texts == [] + assert result.unique_ids == [] + assert result.unique_embeddings_dict == {} + assert result.duplicate_to_target_map == {} + + def test_llm_supervise_single_text(self): + """Test llm-supervise with single text.""" + texts = ["Single text"] + + # Mock embeddings for testing + mock_embeddings = [[0.1, 0.2, 0.3]] + + result = deduplicate_internally( + texts=texts, + embeddings=mock_embeddings, + strategy="llm-supervise" + ) + + assert isinstance(result, DeduplicationResult) + assert result.original_texts == texts + assert result.unique_ids == [0] + assert result.unique_embeddings_dict == {0: mock_embeddings[0]} + assert result.duplicate_to_target_map == {} + + def test_llm_supervise_invalid_threshold(self): + """Test llm-supervise with invalid threshold.""" + texts = ["text1", "text2"] + mock_embeddings = [[0.1, 0.2], [0.3, 0.4]] + + with pytest.raises(ValueError, match="Threshold must be between 0 and 1"): + deduplicate_internally( + texts=texts, + embeddings=mock_embeddings, + threshold=1.5, # Invalid threshold + strategy="llm-supervise" + ) + + def test_llm_supervise_missing_embeddings(self): + """Test llm-supervise without providing embeddings.""" + texts = ["text1", "text2"] + + with pytest.raises(ValueError, match="Either 'embedding_instance' or 'embeddings' must be provided"): + deduplicate_internally( + texts=texts, + strategy="llm-supervise" + ) + + def test_llm_supervise_mismatched_embeddings_length(self): + """Test llm-supervise with mismatched embeddings length.""" + texts = ["text1", "text2"] + mock_embeddings = [[0.1, 0.2]] # Only one embedding for two texts + + with pytest.raises(ValueError, match="The length of 'embeddings' does not match the length of 'texts'"): + deduplicate_internally( + texts=texts, + embeddings=mock_embeddings, + strategy="llm-supervise" + ) + + def test_llm_supervise_both_embedding_sources_provided(self): + """Test llm-supervise with both embedding_instance and embeddings.""" + texts = ["text1", "text2"] + mock_embeddings = [[0.1, 0.2], [0.3, 0.4]] + + # Mock embedding instance + class MockEmbedding: + def embed_list(self, texts): + return [[0.1, 0.2], [0.3, 0.4]] + + with pytest.raises(ValueError, match="Cannot provide both 'embedding_instance' and 'embeddings'"): + deduplicate_internally( + texts=texts, + embeddings=mock_embeddings, + embedding_instance=MockEmbedding(), + strategy="llm-supervise" + ) + + def test_llm_supervise_basic_functionality(self): + """Test basic llm-supervise functionality with mock data.""" + texts = [ + "What is artificial intelligence?", + "AI is a field of computer science", + "What is artificial intelligence?", # Duplicate + "Deep learning is a subset of AI", + ] + + # Mock embeddings with high similarity for duplicates + mock_embeddings = [ + [0.1, 0.2, 0.3], # text 0 + [0.4, 0.5, 0.6], # text 1 + [0.1, 0.2, 0.3], # text 2 (same as text 0) + [0.7, 0.8, 0.9], # text 3 + ] + + result = deduplicate_internally( + texts=texts, + embeddings=mock_embeddings, + threshold=0.8, # High threshold to avoid false positives in test + strategy="llm-supervise" + ) + + assert isinstance(result, DeduplicationResult) + assert result.original_texts == texts + # Should have unique IDs (exact behavior depends on LLM judgment) + assert len(result.unique_ids) >= 1 + assert len(result.unique_embeddings_dict) >= 1 \ No newline at end of file From f9c159ac8257738ede881d1bf90f39c9a0dc453b Mon Sep 17 00:00:00 2001 From: shudanluo Date: Thu, 18 Sep 2025 13:09:07 +0200 Subject: [PATCH 2/2] Remove duplicate TODO comment in chat_agent.py - Remove duplicate 'TODO: Handle tool calls' comment on line 1451 - Keep the original TODO comment on line 1409 for proper tracking - This addresses issue #3002 by cleaning up redundant comments in codebase --- camel/agents/chat_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/camel/agents/chat_agent.py b/camel/agents/chat_agent.py index c2ff40168c..ff950c69f4 100644 --- a/camel/agents/chat_agent.py +++ b/camel/agents/chat_agent.py @@ -1448,7 +1448,6 @@ async def _ahandle_stream_response( ] usage_dict = self.get_usage_dict(output_messages, prompt_tokens) - # TODO: Handle tool calls return ModelResponse( response=response, tool_call_requests=None,