diff --git a/docs/evaluation/faq/unit-testing.mdx b/docs/evaluation/faq/unit-testing.mdx
index 339b1254..3bec87a1 100644
--- a/docs/evaluation/faq/unit-testing.mdx
+++ b/docs/evaluation/faq/unit-testing.mdx
@@ -135,12 +135,12 @@ def test_embedding_similarity(query, expectation):
     prediction = my_chatbot(query)
     expect.embedding_distance(
         # This step logs the distance as feedback for this run
-        prediction=prediction, expectation=expectation
+        prediction=prediction, reference=expectation
         # Adding a matcher (in this case, 'to_be_*"), logs 'expectation' feedback
     ).to_be_less_than(0.5)  # Optional predicate to assert against
     expect.edit_distance(
         # This computes the normalized Damerau-Levenshtein distance between the two strings
-        prediction=prediction, expectation=expectation
+        prediction=prediction, reference=expectation
         # If no predicate is provided below, 'assert' isn't called, but the score is still logged
     )
 ```
@@ -195,8 +195,8 @@ The following metrics are available off-the-shelf:
 | -------------------- | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `pass`               | Binary pass/fail score, 1 for pass, 0 for fail               | `assert False` # Fails |
 | `expectation`        | Binary expectation score, 1 if expectation is met, 0 if not  | `expect(prediction).against(lambda x: re.search(r"\b[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\b", x)` ) |
-| `embedding_distance` | Cosine distance between two embeddings                       | expect.embedding_distance(prediction=prediction, expectation=expectation) |
-| `edit_distance`      | Edit distance between two strings                            | expect.edit_distance(prediction=prediction, expectation=expectation) |
+| `embedding_distance` | Cosine distance between two embeddings                       | expect.embedding_distance(prediction=prediction, reference=expectation) |
+| `edit_distance`      | Edit distance between two strings                            | expect.edit_distance(prediction=prediction, reference=expectation) |

 You can also log any arbitrary feeback within a unit test manually using the `client`.

diff --git a/versioned_docs/version-2.0/tutorials/Developers/agents.mdx b/versioned_docs/version-2.0/tutorials/Developers/agents.mdx
index ea2c371f..f53468fc 100644
--- a/versioned_docs/version-2.0/tutorials/Developers/agents.mdx
+++ b/versioned_docs/version-2.0/tutorials/Developers/agents.mdx
@@ -317,7 +317,6 @@ messages = graph.invoke(msg,config)
 messages['messages'][-1].content

 ## Stream
-import uuid

 _printed = set()
 thread_id = str(uuid.uuid4())
@@ -367,7 +366,9 @@ First, create a dataset that evaluates end-to-end performance of the agent. We c

 ```python
 from langsmith import Client
+
 client = Client()
+
 # Create a dataset
 examples = [
     ("Which country's customers spent the most? And how much did they spend?", "The country whose customers spent the most is the USA, with a total expenditure of $523.06"),
@@ -464,13 +465,15 @@ We can check a specific tool call using [a custom evaluator](https://docs.smith.
 - We specify the `reference` tool call for the step that we are evaluating, `expected_tool_call`.

 ```python
+from langsmith.schemas import Example, Run
+
 def predict_assistant(example: dict):
     """Invoke assistant for single tool call evaluation"""
     msg = [ ("user", example["input"]) ]
     result = assistant_runnable.invoke({"messages":msg})
     return {"response": result}

-from langsmith.schemas import Example, Run
+
 def check_specific_tool_call(root_run: Run, example: Example) -> dict:
     """
     Check if the first tool call in the response matches the expected tool call.
@@ -594,4 +597,4 @@ experiment_results = evaluate(

 You can see the results from the evaluations logged to the dataset!

-https://smith.langchain.com/public/20808486-67c3-4e30-920b-6d49d6f2b6b8/d
+[https://smith.langchain.com/public/20808486-67c3-4e30-920b-6d49d6f2b6b8/d](https://smith.langchain.com/public/20808486-67c3-4e30-920b-6d49d6f2b6b8/d)
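For reference, a minimal sketch of how the renamed `reference=` keyword reads after this change, using the same `expect` helpers as the doc. `my_chatbot` is a hypothetical placeholder for the chain under test, the 0.5 threshold is illustrative only, and the `@unit` decorator and parametrization from the surrounding page are omitted for brevity.

```python
# Sketch only: `my_chatbot` stands in for whatever chain the test exercises.
from langsmith import expect


def my_chatbot(query: str) -> str:
    return "Atlanta"  # placeholder response for illustration


def test_embedding_similarity():
    prediction = my_chatbot("What's the capital of Georgia?")
    expect.embedding_distance(
        # Logs the cosine distance as feedback for this run
        prediction=prediction, reference="Atlanta"
    ).to_be_less_than(0.5)  # illustrative threshold
    expect.edit_distance(
        # Logs the normalized Damerau-Levenshtein distance; without a matcher,
        # nothing is asserted but the score is still recorded
        prediction=prediction, reference="Atlanta"
    )
```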