diff --git a/docs/evaluation/faq/unit-testing.mdx b/docs/evaluation/faq/unit-testing.mdx
index 339b1254..3bec87a1 100644
--- a/docs/evaluation/faq/unit-testing.mdx
+++ b/docs/evaluation/faq/unit-testing.mdx
@@ -135,12 +135,12 @@ def test_embedding_similarity(query, expectation):
     prediction = my_chatbot(query)
     expect.embedding_distance(
         # This step logs the distance as feedback for this run
-        prediction=prediction, expectation=expectation
+        prediction=prediction, reference=expectation
         # Adding a matcher (in this case, 'to_be_*"), logs 'expectation' feedback
     ).to_be_less_than(0.5)  # Optional predicate to assert against
     expect.edit_distance(
         # This computes the normalized Damerau-Levenshtein distance between the two strings
-        prediction=prediction, expectation=expectation
+        prediction=prediction, reference=expectation
         # If no predicate is provided below, 'assert' isn't called, but the score is still logged
     )
 ```
@@ -195,8 +195,8 @@ The following metrics are available off-the-shelf:
 | -------------------- | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `pass`               | Binary pass/fail score, 1 for pass, 0 for fail               | `assert False` # Fails |
 | `expectation`        | Binary expectation score, 1 if expectation is met, 0 if not  | `expect(prediction).against(lambda x: re.search(r"\b[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\b", x)` ) |
-| `embedding_distance` | Cosine distance between two embeddings                       | expect.embedding_distance(prediction=prediction, expectation=expectation) |
-| `edit_distance`      | Edit distance between two strings                            | expect.edit_distance(prediction=prediction, expectation=expectation) |
+| `embedding_distance` | Cosine distance between two embeddings                       | expect.embedding_distance(prediction=prediction, reference=expectation) |
+| `edit_distance`      | Edit distance between two strings                            | expect.edit_distance(prediction=prediction, reference=expectation) |

 You can also log any arbitrary feeback within a unit test manually using the `client`.

diff --git a/versioned_docs/version-2.0/tutorials/Developers/agents.mdx b/versioned_docs/version-2.0/tutorials/Developers/agents.mdx
index ea2c371f..f53468fc 100644
--- a/versioned_docs/version-2.0/tutorials/Developers/agents.mdx
+++ b/versioned_docs/version-2.0/tutorials/Developers/agents.mdx
@@ -317,7 +317,6 @@ messages = graph.invoke(msg,config)
 messages['messages'][-1].content

 ## Stream
-import uuid

 _printed = set()
 thread_id = str(uuid.uuid4())
@@ -367,7 +366,9 @@ First, create a dataset that evaluates end-to-end performance of the agent. We c

 ```python
 from langsmith import Client
+
 client = Client()
+
 # Create a dataset
 examples = [
     ("Which country's customers spent the most? And how much did they spend?", "The country whose customers spent the most is the USA, with a total expenditure of $523.06"),
@@ -464,13 +465,15 @@ We can check a specific tool call using [a custom evaluator](https://docs.smith.
 - We specify the `reference` tool call for the step that we are evaluating, `expected_tool_call`.

 ```python
+from langsmith.schemas import Example, Run
+
 def predict_assistant(example: dict):
     """Invoke assistant for single tool call evaluation"""
     msg = [ ("user", example["input"]) ]
     result = assistant_runnable.invoke({"messages":msg})
     return {"response": result}

-from langsmith.schemas import Example, Run
+
 def check_specific_tool_call(root_run: Run, example: Example) -> dict:
     """
     Check if the first tool call in the response matches the expected tool call.
@@ -594,4 +597,4 @@ experiment_results = evaluate(

 You can see the results from the evaluations logged to the dataset!

-https://smith.langchain.com/public/20808486-67c3-4e30-920b-6d49d6f2b6b8/d
+[https://smith.langchain.com/public/20808486-67c3-4e30-920b-6d49d6f2b6b8/d](https://smith.langchain.com/public/20808486-67c3-4e30-920b-6d49d6f2b6b8/d)
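For reference, a minimal sketch of how the renamed `reference=` keyword reads after this change, using the same `expect` helpers as the doc. `my_chatbot` is a hypothetical placeholder for the chain under test, the 0.5 threshold is illustrative only, and the `@unit` decorator and parametrization from the surrounding page are omitted for brevity.

```python
# Sketch only: `my_chatbot` stands in for whatever chain the test exercises.
from langsmith import expect


def my_chatbot(query: str) -> str:
    return "Atlanta"  # placeholder response for illustration


def test_embedding_similarity():
    prediction = my_chatbot("What's the capital of Georgia?")
    expect.embedding_distance(
        # Logs the cosine distance as feedback for this run
        prediction=prediction, reference="Atlanta"
    ).to_be_less_than(0.5)  # illustrative threshold
    expect.edit_distance(
        # Logs the normalized Damerau-Levenshtein distance; without a matcher,
        # nothing is asserted but the score is still recorded
        prediction=prediction, reference="Atlanta"
    )
```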