Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion camel/agents/chat_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1448,7 +1448,6 @@ async def _ahandle_stream_response(
]
usage_dict = self.get_usage_dict(output_messages, prompt_tokens)

# TODO: Handle tool calls
return ModelResponse(
response=response,
tool_call_requests=None,
Expand Down
172 changes: 164 additions & 8 deletions camel/utils/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def deduplicate_internally(

strategy is used to specify different strategies, where 'top1' selects the
one with highest similarity, and 'llm-supervise' uses LLM to determine if
texts are duplicates (not yet implemented).
texts are duplicates.

Args:
texts (List[str]): The list of texts to be deduplicated.
Expand Down Expand Up @@ -144,17 +144,11 @@ def deduplicate_internally(
unique_embeddings_dict={
0: embeddings[0]
if embeddings
else embedding_instance.embed_list(texts)[0] # type: ignore[union-attr]
else embedding_instance.embed_list(texts)[0] if embedding_instance else [0.0] # type: ignore[union-attr]
},
duplicate_to_target_map={},
)

if strategy == "llm-supervise":
# TODO: Implement LLM-supervise deduplication.
raise NotImplementedError(
"LLM-supervise deduplication is not yet implemented."
)

# Check if the parameters are valid.
if not 0 <= threshold <= 1:
raise ValueError("Threshold must be between 0 and 1")
Expand All @@ -169,6 +163,15 @@ def deduplicate_internally(
"Please choose only one way to supply embeddings."
)

if strategy == "llm-supervise":
return _deduplicate_with_llm_supervision(
texts=texts,
threshold=threshold,
embedding_instance=embedding_instance,
embeddings=embeddings,
batch_size=batch_size,
)

if embedding_instance is not None:
# Use Camel's embedding_instance to vectorize.
embeddings = embedding_instance.embed_list(texts)
Expand Down Expand Up @@ -230,3 +233,156 @@ def deduplicate_internally(
unique_embeddings_dict=unique_embeddings_dict,
duplicate_to_target_map=duplicate_to_target_map,
)


def _deduplicate_with_llm_supervision(
texts: List[str],
threshold: float,
embedding_instance: Optional[BaseEmbedding[str]],
embeddings: Optional[List[List[float]]],
batch_size: int,
) -> DeduplicationResult:
r"""Deduplicate texts using LLM supervision.

This function uses embeddings to find potential duplicates, then uses an LLM
to determine if they are actually duplicates.

Args:
texts: List of texts to deduplicate
threshold: Similarity threshold for initial filtering
embedding_instance: Embedding instance for computing embeddings
embeddings: Pre-computed embeddings
batch_size: Batch size for processing

Returns:
DeduplicationResult with LLM-supervised deduplication results
"""
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# First, get embeddings if not provided
if embedding_instance is not None:
embeddings = embedding_instance.embed_list(texts)
elif embeddings is None:
raise ValueError(
"Either 'embedding_instance' or 'embeddings' must be provided."
)

if len(embeddings) != len(texts):
raise ValueError(
"The length of 'embeddings' does not match the length of 'texts'."
)

# Convert embeddings to numpy array
embeddings_array = np.array(embeddings)
n = len(texts)
duplicate_to_target_map: Dict[int, int] = {}

# Find potential duplicates using cosine similarity
potential_duplicates = []

for i in range(0, n, batch_size):
batch_end = min(i + batch_size, n)
batch_similarities = cosine_similarity(
embeddings_array[i:batch_end], embeddings_array[:batch_end]
)

# Create mask for lower triangle
tril_mask = np.tril(np.ones_like(batch_similarities), k=-1)
batch_similarities = batch_similarities * tril_mask

# Find pairs above threshold
for j in range(batch_end - i):
for k in range(j):
if batch_similarities[j, k] > threshold:
potential_duplicates.append((i + j, k))

# Use LLM to determine actual duplicates
if potential_duplicates:
duplicate_pairs = _llm_judge_duplicates(texts, potential_duplicates)

# Build duplicate map
for duplicate_idx, target_idx in duplicate_pairs:
duplicate_to_target_map[duplicate_idx] = target_idx

# Get unique ids and embeddings
unique_ids = []
unique_embeddings_dict = {}

for i, (_, emb) in enumerate(zip(texts, embeddings)):
if i not in duplicate_to_target_map:
unique_ids.append(i)
unique_embeddings_dict[i] = emb

return DeduplicationResult(
original_texts=texts,
unique_ids=unique_ids,
unique_embeddings_dict=unique_embeddings_dict,
duplicate_to_target_map=duplicate_to_target_map,
)


def _llm_judge_duplicates(
texts: List[str],
potential_duplicates: List[tuple[int, int]]
) -> List[tuple[int, int]]:
r"""Use LLM to judge if potential duplicate pairs are actually duplicates.

Args:
texts: List of all texts
potential_duplicates: List of (duplicate_idx, target_idx) pairs

Returns:
List of (duplicate_idx, target_idx) pairs that LLM judged as duplicates
"""
try:
# Import here to avoid circular import issues
from camel.models import ModelFactory
from camel.types import ModelPlatformType
from camel.types.enums import ModelType

# Create a simple LLM model for judgment
# Using a lightweight model for efficiency
llm_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_3_5_TURBO,
)

actual_duplicates = []

for duplicate_idx, target_idx in potential_duplicates:
text1 = texts[duplicate_idx]
text2 = texts[target_idx]

# Create prompt for LLM judgment
prompt = f"""You are a text deduplication expert. Your task is to determine if two texts are duplicates or near-duplicates.

Text 1: "{text1}"
Text 2: "{text2}"

Are these texts duplicates or near-duplicates? Consider:
- Semantic similarity
- Information overlap
- Whether they convey the same meaning

Respond with only "YES" if they are duplicates, or "NO" if they are not duplicates."""

try:
# Get LLM response
response = llm_model.run([{"role": "user", "content": prompt}])
judgment = response.choices[0].message.content.strip().upper()

if judgment == "YES":
actual_duplicates.append((duplicate_idx, target_idx))

except Exception as e:
# If LLM fails, default to keeping both texts (no deduplication)
print(f"LLM judgment failed for pair {duplicate_idx}-{target_idx}: {e}")
continue

return actual_duplicates

except Exception as e:
print(f"Failed to initialize LLM for deduplication: {e}")
# Fallback: return empty list (no deduplication)
return []
164 changes: 164 additions & 0 deletions test/utils/test_deduplication_llm_supervise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import pytest

from camel.utils.deduplication import deduplicate_internally, DeduplicationResult


class TestLLMSuperviseDeduplication:
"""Test cases for LLM-supervise deduplication strategy."""

def test_llm_supervise_strategy_exists(self):
"""Test that the llm-supervise strategy is recognized."""
# Test that the function accepts the strategy parameter
# This should not raise a NotImplementedError anymore
# Single text doesn't need embeddings, so it should work
result = deduplicate_internally(
texts=["test"],
strategy="llm-supervise"
)

# Should return a valid result for single text
assert isinstance(result, DeduplicationResult)
assert result.original_texts == ["test"]
assert result.unique_ids == [0]
assert result.duplicate_to_target_map == {}

def test_llm_supervise_strategy_parameter(self):
"""Test that the strategy parameter accepts llm-supervise."""
import inspect
sig = inspect.signature(deduplicate_internally)
strategy_param = sig.parameters['strategy']

# Check that 'llm-supervise' is a valid option
if hasattr(strategy_param.annotation, '__args__'):
valid_strategies = strategy_param.annotation.__args__
assert 'llm-supervise' in valid_strategies, f"llm-supervise not in valid strategies: {valid_strategies}"
assert 'top1' in valid_strategies, f"top1 not in valid strategies: {valid_strategies}"

def test_llm_supervise_empty_texts(self):
"""Test llm-supervise with empty text list."""
result = deduplicate_internally(
texts=[],
strategy="llm-supervise"
)

assert isinstance(result, DeduplicationResult)
assert result.original_texts == []
assert result.unique_ids == []
assert result.unique_embeddings_dict == {}
assert result.duplicate_to_target_map == {}

def test_llm_supervise_single_text(self):
"""Test llm-supervise with single text."""
texts = ["Single text"]

# Mock embeddings for testing
mock_embeddings = [[0.1, 0.2, 0.3]]

result = deduplicate_internally(
texts=texts,
embeddings=mock_embeddings,
strategy="llm-supervise"
)

assert isinstance(result, DeduplicationResult)
assert result.original_texts == texts
assert result.unique_ids == [0]
assert result.unique_embeddings_dict == {0: mock_embeddings[0]}
assert result.duplicate_to_target_map == {}

def test_llm_supervise_invalid_threshold(self):
"""Test llm-supervise with invalid threshold."""
texts = ["text1", "text2"]
mock_embeddings = [[0.1, 0.2], [0.3, 0.4]]

with pytest.raises(ValueError, match="Threshold must be between 0 and 1"):
deduplicate_internally(
texts=texts,
embeddings=mock_embeddings,
threshold=1.5, # Invalid threshold
strategy="llm-supervise"
)

def test_llm_supervise_missing_embeddings(self):
"""Test llm-supervise without providing embeddings."""
texts = ["text1", "text2"]

with pytest.raises(ValueError, match="Either 'embedding_instance' or 'embeddings' must be provided"):
deduplicate_internally(
texts=texts,
strategy="llm-supervise"
)

def test_llm_supervise_mismatched_embeddings_length(self):
"""Test llm-supervise with mismatched embeddings length."""
texts = ["text1", "text2"]
mock_embeddings = [[0.1, 0.2]] # Only one embedding for two texts

with pytest.raises(ValueError, match="The length of 'embeddings' does not match the length of 'texts'"):
deduplicate_internally(
texts=texts,
embeddings=mock_embeddings,
strategy="llm-supervise"
)

def test_llm_supervise_both_embedding_sources_provided(self):
"""Test llm-supervise with both embedding_instance and embeddings."""
texts = ["text1", "text2"]
mock_embeddings = [[0.1, 0.2], [0.3, 0.4]]

# Mock embedding instance
class MockEmbedding:
def embed_list(self, texts):
return [[0.1, 0.2], [0.3, 0.4]]

with pytest.raises(ValueError, match="Cannot provide both 'embedding_instance' and 'embeddings'"):
deduplicate_internally(
texts=texts,
embeddings=mock_embeddings,
embedding_instance=MockEmbedding(),
strategy="llm-supervise"
)

def test_llm_supervise_basic_functionality(self):
"""Test basic llm-supervise functionality with mock data."""
texts = [
"What is artificial intelligence?",
"AI is a field of computer science",
"What is artificial intelligence?", # Duplicate
"Deep learning is a subset of AI",
]

# Mock embeddings with high similarity for duplicates
mock_embeddings = [
[0.1, 0.2, 0.3], # text 0
[0.4, 0.5, 0.6], # text 1
[0.1, 0.2, 0.3], # text 2 (same as text 0)
[0.7, 0.8, 0.9], # text 3
]

result = deduplicate_internally(
texts=texts,
embeddings=mock_embeddings,
threshold=0.8, # High threshold to avoid false positives in test
strategy="llm-supervise"
)

assert isinstance(result, DeduplicationResult)
assert result.original_texts == texts
# Should have unique IDs (exact behavior depends on LLM judgment)
assert len(result.unique_ids) >= 1
assert len(result.unique_embeddings_dict) >= 1