Skip to content

Commit

Permalink
langtrace eval (#237)
Browse files Browse the repository at this point in the history
  • Loading branch information
gyliu513 authored Dec 18, 2024
1 parent db1df0a commit 12a6c1b
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,4 @@ newrelic/.env
otel/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/__pycache__/
otel/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/otel_lib/__pycache__/
__pycache__
logs/
32 changes: 32 additions & 0 deletions langtrace/example_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from inspect_ai import Task, task
from inspect_ai.dataset import csv_dataset
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import self_critique, generate

@task
def example_eval():
    """Build the example evaluation task backed by a Langtrace dataset.

    Samples are loaded with ``csv_dataset`` from a ``langtracefs://``
    location. Each sample is answered with ``generate()`` and then refined
    by a ``self_critique`` pass that uses ``openai/gpt-4o``; results are
    scored with ``model_graded_fact()``.

    Returns:
        The configured ``Task``.

    Raises:
        Exception: Any error raised while loading the dataset or building
            the task is printed and then re-raised unchanged.
    """
    try:
        dataset = csv_dataset("langtracefs://cm4lrz7tq00075jmgkdtlq6w4")
        solver = [
            generate(),
            self_critique(model="openai/gpt-4o"),
        ]
        scorer = model_graded_fact()

        # ``solver=`` is the keyword used by hello.py in this same change;
        # ``plan=`` is understood to be its deprecated spelling.
        return Task(
            dataset=dataset,
            solver=solver,
            scorer=scorer,
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        # Re-raise instead of returning None: a task function that yields
        # no Task hides the real failure from whoever invoked it.
        raise



# Example command lines for running this eval against different models.
# They are kept in a bare string literal, so nothing here is executed.
'''
inspect eval example_eval.py --model openai/gpt-3.5-turbo --log-dir langtracefs://cm4lrz7tq00075jmgkdtlq6w4
inspect eval example_eval.py --model openai/gpt-4o-mini --log-dir langtracefs://cm4lrz7tq00075jmgkdtlq6w4
inspect eval example_eval.py --model anthropic/claude-3-5-sonnet-20240620 --log-dir langtracefs://cm4lrz7tq00075jmgkdtlq6w4
inspect eval example_eval.py --model ollama/llama3.1 --log-dir langtracefs://cm4lrz7tq00075jmgkdtlq6w4
'''
22 changes: 22 additions & 0 deletions langtrace/hello.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import exact
from inspect_ai.solver import generate

# This is the simplest possible Inspect eval, useful for testing your configuration / network / platform etc.


@task
def hello_world():
    """Return a single-sample smoke-test task.

    The one sample asks the model to reply with "Hello World"; the task
    runs ``generate()`` as its only solver step and scores with ``exact()``.
    """
    greeting = Sample(input="Just reply with Hello World", target="Hello World")
    return Task(dataset=[greeting], solver=[generate()], scorer=exact())

0 comments on commit 12a6c1b

Please sign in to comment.