Jacob/wrappers #10

Closed · wants to merge 8 commits
README.md (113 additions, 0 deletions)
@@ -1248,6 +1248,119 @@ console.log(result);
```
</details>

## Agent wrappers

The `agentevals` package also includes agent wrappers that help implement "in-the-loop" style evaluation, where the evaluator runs *as part* of your agent and shapes its trajectory and response.

### Reflection

The `wrap_agent_with_reflection` function wraps a LangGraph agent with a reflection loop. When the agent finishes running, the wrapper passes the agent's result and trajectory to an evaluator. If the evaluation fails, the wrapper appends a user message detailing the failure and prompts the agent to fix its mistakes and try again; otherwise, it returns the agent's result as normal.

You can pass your own evaluator into `wrap_agent_with_reflection`; if you don't, the wrapper automatically generates success criteria from the input and uses an LLM-as-judge evaluator to grade the agent's trajectory and output, as shown in the example below. A sketch of passing a custom evaluator follows the example.

```python
import math

from agentevals.wrappers.reflection import wrap_agent_with_reflection

from langchain.chat_models import init_chat_model
from langchain_core.tools import tool

from langgraph.prebuilt import create_react_agent


@tool
def add(a: float, b: float) -> float:
"""Add two numbers together."""
return a + b


@tool
def multiply(a: float, b: float) -> float:
"""Multiply two numbers together."""
return a * b


@tool
def divide(a: float, b: float) -> float:
"""Divide two numbers."""
return a / b


@tool
def subtract(a: float, b: float) -> float:
"""Subtract two numbers."""
return a - b


@tool
def sin(a: float) -> float:
"""Take the sine of a number."""
return math.sin(a)


@tool
def cos(a: float) -> float:
"""Take the cosine of a number."""
return math.cos(a)


@tool
def radians(a: float) -> float:
"""Convert degrees to radians."""
return math.radians(a)


@tool
def exponentiation(a: float, b: float) -> float:
"""Raise one number to the power of another."""
return a**b


@tool
def sqrt(a: float) -> float:
"""Take the square root of a number."""
return math.sqrt(a)


@tool
def ceil(a: float) -> float:
"""Round a number up to the nearest integer."""
return math.ceil(a)

# Initialize agent
llm = init_chat_model("openai:gpt-4o", temperature=0.1)
tools = [
sin,
cos,
radians,
ceil,
exponentiation,
sqrt,
add,
multiply,
divide,
subtract,
]

# Wrap the agent so that its result and trajectory are evaluated after each
# run; a failing evaluation adds a corrective user message and retries.
agent = wrap_agent_with_reflection(agent=create_react_agent(llm, tools))

query = (
"A batter hits a baseball at 45.847 m/s at an angle of "
"23.474° above the horizontal. The outfielder, who starts facing the batter, picks up the baseball as it lands, "
"then throws it back towards the batter at 24.12 m/s at an angle of 39.12 degrees. "
"How far is the baseball from where the batter originally hit it? "
"Assume zero air resistance."
)

# Stream updates from the wrapped agent; the reflection loop may add extra
# turns when evaluation fails, so allow a higher recursion limit.
for step in agent.stream(
{"messages": query}, stream_mode="updates", config={"recursion_limit": 50}
):
for _, update in step.items():
for message in update.get("messages", []):
message.pretty_print()
```
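
If you want to supply your own evaluator rather than relying on the auto-generated criteria, a minimal sketch is shown below. It assumes `wrap_agent_with_reflection` accepts an `evaluator` keyword argument (a hypothetical parameter name, check the wrapper's signature in your installed version) and that the trajectory LLM-as-judge factory lives in `agentevals.trajectory.llm`:

```python
from agentevals.trajectory.llm import (
    TRAJECTORY_ACCURACY_PROMPT,
    create_trajectory_llm_as_judge,
)
from agentevals.wrappers.reflection import wrap_agent_with_reflection

from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent

# Grade trajectories with the prebuilt accuracy judge instead of
# auto-generated success criteria.
trajectory_judge = create_trajectory_llm_as_judge(
    prompt=TRAJECTORY_ACCURACY_PROMPT,
    model="openai:o3-mini",
)

llm = init_chat_model("openai:gpt-4o", temperature=0.1)

# `tools` is the list defined in the example above; `evaluator=` is the
# assumed hook for the custom evaluator.
agent = wrap_agent_with_reflection(
    agent=create_react_agent(llm, tools),
    evaluator=trajectory_judge,
)
```

You can then stream from the wrapped agent exactly as in the example above.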

## Python Async Support

All `agentevals` evaluators support Python [asyncio](https://docs.python.org/3/library/asyncio.html). As a convention, evaluators created via a factory function have `async` immediately after `create_` in the factory name (for example, `create_async_trajectory_llm_as_judge`), while evaluators used directly end in `_async` (e.g. `trajectory_strict_match_async`).
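
As a rough illustration of the convention, here is a minimal sketch using the async trajectory judge factory. It assumes the async factory is importable from `agentevals.trajectory.llm` and takes the same `prompt`/`model` arguments as its synchronous counterpart:

```python
import asyncio

from agentevals.trajectory.llm import (
    TRAJECTORY_ACCURACY_PROMPT,
    create_async_trajectory_llm_as_judge,
)

# `async` comes right after `create_` in the factory name.
async_trajectory_judge = create_async_trajectory_llm_as_judge(
    prompt=TRAJECTORY_ACCURACY_PROMPT,
    model="openai:o3-mini",
)


async def main():
    # Async evaluators are awaited directly and take the same keyword
    # arguments (`inputs`, `outputs`, `reference_outputs`) as the sync ones.
    result = await async_trajectory_judge(
        outputs=[
            {"role": "user", "content": "What is 3 + 4?"},
            {"role": "assistant", "content": "3 + 4 is 7."},
        ],
    )
    print(result)


asyncio.run(main())
```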
python/agentevals/trajectory/llm.py (19 additions, 62 deletions)
@@ -1,5 +1,4 @@
from __future__ import annotations
import json

from openevals.llm import (
_create_llm_as_judge_scorer,
@@ -36,13 +35,17 @@
- Is semantically equivalent to the provided reference trajectory
</Rubric>

Grade the following trajectory:
Based on the following reference trajectory:

<reference_trajectory>
{reference_outputs}
</reference_trajectory>

Grade this actual trajectory:

<trajectory>
{outputs}
</trajectory>
{inputs}
{reference_outputs}
"""

TRAJECTORY_ACCURACY_PROMPT = """You are an expert data labeler.
@@ -65,7 +68,6 @@
<trajectory>
{outputs}
</trajectory>
{inputs}
"""

if TYPE_CHECKING:
@@ -89,14 +91,11 @@ def _format_inputs(
formatted_inputs = f"\nThe agent generated the trajectory from the following input:\n<input>\n{inputs}\n</input>\n"
else:
formatted_inputs = ""
if isinstance(outputs, dict):
formatted_outputs = json.dumps(outputs)
else:
formatted_outputs = _chat_completion_messages_to_string(outputs)
formatted_outputs = _chat_completion_messages_to_string(outputs)
return (
formatted_inputs,
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
)


@@ -163,32 +162,11 @@ def _wrapped_evaluator(
] = None,
**kwargs,
) -> EvaluatorResult:
if prompt == TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE:
if reference_outputs is None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE requires reference_outputs to compare against"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
elif prompt == TRAJECTORY_ACCURACY_PROMPT:
if reference_outputs is not None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT requires reference_outputs to be None"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
else:
formatted_outputs = _normalize_to_openai_messages_list(outputs)
formatted_reference_outputs = _normalize_to_openai_messages_list(
reference_outputs
)
formatted_inputs = inputs
(
formatted_inputs,
formatted_outputs,
formatted_reference_outputs,
) = _format_inputs(inputs, outputs, reference_outputs)
return _run_evaluator(
run_name=f"llm_as_{feedback_key}_judge",
scorer=scorer,
@@ -265,32 +243,11 @@ async def _wrapped_evaluator(
] = None,
**kwargs,
) -> EvaluatorResult:
if prompt == TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE:
if reference_outputs is None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE requires reference_outputs to compare against"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
elif prompt == TRAJECTORY_ACCURACY_PROMPT:
if reference_outputs is not None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT requires reference_outputs to be None"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
else:
formatted_outputs = _normalize_to_openai_messages_list(outputs)
formatted_reference_outputs = _normalize_to_openai_messages_list(
reference_outputs
)
formatted_inputs = inputs
(
formatted_inputs,
formatted_outputs,
formatted_reference_outputs,
) = _format_inputs(inputs, outputs, reference_outputs)
return await _arun_evaluator(
run_name=f"llm_as_{feedback_key}_judge",
scorer=scorer,