Jacob/wrappers #10

Closed · wants to merge 8 commits
README.md (113 additions, 0 deletions)
@@ -1248,6 +1248,119 @@ console.log(result);
```
</details>

## Agent wrappers

The `agentevals` package also includes agent wrappers that help implement "in-the-loop" style evaluation, where the evaluator runs *as part* of your agent and shapes its trajectory and response.

### Reflection

The `wrap_agent_with_reflection` function wraps a LangGraph agent with a reflection loop. When the agent finishes running, the wrapper passes the agent's result and trajectory to an evaluator. If the evaluation fails, the wrapper appends a user message detailing the failure and prompts the agent to fix its mistakes and try again; otherwise, it returns the agent's result as normal.

You can pass your own evaluator into `wrap_agent_with_reflection`; if you don't, the wrapper automatically generates success criteria from the input and uses an LLM-as-judge evaluator to grade the agent's trajectory and output, as shown in the example below. A sketch of passing a custom evaluator follows the example.

```python
import math

from agentevals.wrappers.reflection import wrap_agent_with_reflection

from langchain.chat_models import init_chat_model
from langchain_core.tools import tool

from langgraph.prebuilt import create_react_agent


@tool
def add(a: float, b: float) -> float:
"""Add two numbers together."""
return a + b


@tool
def multiply(a: float, b: float) -> float:
"""Multiply two numbers together."""
return a * b


@tool
def divide(a: float, b: float) -> float:
"""Divide two numbers."""
return a / b


@tool
def subtract(a: float, b: float) -> float:
"""Subtract two numbers."""
return a - b


@tool
def sin(a: float) -> float:
"""Take the sine of a number."""
return math.sin(a)


@tool
def cos(a: float) -> float:
"""Take the cosine of a number."""
return math.cos(a)


@tool
def radians(a: float) -> float:
"""Convert degrees to radians."""
return math.radians(a)


@tool
def exponentiation(a: float, b: float) -> float:
"""Raise one number to the power of another."""
return a**b


@tool
def sqrt(a: float) -> float:
"""Take the square root of a number."""
return math.sqrt(a)


@tool
def ceil(a: float) -> float:
"""Round a number up to the nearest integer."""
return math.ceil(a)

# Initialize agent
llm = init_chat_model("openai:gpt-4o", temperature=0.1)
tools = [
sin,
cos,
radians,
ceil,
exponentiation,
sqrt,
add,
multiply,
divide,
subtract,
]

# Wrap the agent so that its result and trajectory are evaluated after each
# run; a failing evaluation adds a corrective user message and retries.
agent = wrap_agent_with_reflection(agent=create_react_agent(llm, tools))

query = (
"A batter hits a baseball at 45.847 m/s at an angle of "
"23.474° above the horizontal. The outfielder, who starts facing the batter, picks up the baseball as it lands, "
"then throws it back towards the batter at 24.12 m/s at an angle of 39.12 degrees. "
"How far is the baseball from where the batter originally hit it? "
"Assume zero air resistance."
)

# Stream updates from the wrapped agent; the reflection loop may add extra
# turns when evaluation fails, so allow a higher recursion limit.
for step in agent.stream(
{"messages": query}, stream_mode="updates", config={"recursion_limit": 50}
):
for _, update in step.items():
for message in update.get("messages", []):
message.pretty_print()
```
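
If you want to supply your own evaluator rather than relying on the auto-generated criteria, a minimal sketch is shown below. It assumes `wrap_agent_with_reflection` accepts an `evaluator` keyword argument (a hypothetical parameter name, check the wrapper's signature in your installed version) and that the trajectory LLM-as-judge factory lives in `agentevals.trajectory.llm`:

```python
from agentevals.trajectory.llm import (
    TRAJECTORY_ACCURACY_PROMPT,
    create_trajectory_llm_as_judge,
)
from agentevals.wrappers.reflection import wrap_agent_with_reflection

from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent

# Grade trajectories with the prebuilt accuracy judge instead of
# auto-generated success criteria.
trajectory_judge = create_trajectory_llm_as_judge(
    prompt=TRAJECTORY_ACCURACY_PROMPT,
    model="openai:o3-mini",
)

llm = init_chat_model("openai:gpt-4o", temperature=0.1)

# `tools` is the list defined in the example above; `evaluator=` is the
# assumed hook for the custom evaluator.
agent = wrap_agent_with_reflection(
    agent=create_react_agent(llm, tools),
    evaluator=trajectory_judge,
)
```

You can then stream from the wrapped agent exactly as in the example above.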

## Python Async Support

All `agentevals` evaluators support Python [asyncio](https://docs.python.org/3/library/asyncio.html). As a convention, evaluators created via a factory function have `async` immediately after `create_` in the factory name (for example, `create_async_trajectory_llm_as_judge`), while evaluators used directly end in `_async` (e.g. `trajectory_strict_match_async`).
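
As a rough illustration of the convention, here is a minimal sketch using the async trajectory judge factory. It assumes the async factory is importable from `agentevals.trajectory.llm` and takes the same `prompt`/`model` arguments as its synchronous counterpart:

```python
import asyncio

from agentevals.trajectory.llm import (
    TRAJECTORY_ACCURACY_PROMPT,
    create_async_trajectory_llm_as_judge,
)

# `async` comes right after `create_` in the factory name.
async_trajectory_judge = create_async_trajectory_llm_as_judge(
    prompt=TRAJECTORY_ACCURACY_PROMPT,
    model="openai:o3-mini",
)


async def main():
    # Async evaluators are awaited directly and take the same keyword
    # arguments (`inputs`, `outputs`, `reference_outputs`) as the sync ones.
    result = await async_trajectory_judge(
        outputs=[
            {"role": "user", "content": "What is 3 + 4?"},
            {"role": "assistant", "content": "3 + 4 is 7."},
        ],
    )
    print(result)


asyncio.run(main())
```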
python/agentevals/trajectory/llm.py (19 additions, 62 deletions)
@@ -1,5 +1,4 @@
from __future__ import annotations
import json

from openevals.llm import (
_create_llm_as_judge_scorer,
@@ -36,13 +35,17 @@
- Is semantically equivalent to the provided reference trajectory
</Rubric>

Grade the following trajectory:
Based on the following reference trajectory:

<reference_trajectory>
{reference_outputs}
</reference_trajectory>

Grade this actual trajectory:

<trajectory>
{outputs}
</trajectory>
{inputs}
{reference_outputs}
"""

TRAJECTORY_ACCURACY_PROMPT = """You are an expert data labeler.
@@ -65,7 +68,6 @@
<trajectory>
{outputs}
</trajectory>
{inputs}
"""

if TYPE_CHECKING:
@@ -89,14 +91,11 @@ def _format_inputs(
formatted_inputs = f"\nThe agent generated the trajectory from the following input:\n<input>\n{inputs}\n</input>\n"
else:
formatted_inputs = ""
if isinstance(outputs, dict):
formatted_outputs = json.dumps(outputs)
else:
formatted_outputs = _chat_completion_messages_to_string(outputs)
formatted_outputs = _chat_completion_messages_to_string(outputs)
return (
formatted_inputs,
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
)


@@ -163,32 +162,11 @@ def _wrapped_evaluator(
] = None,
**kwargs,
) -> EvaluatorResult:
if prompt == TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE:
if reference_outputs is None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE requires reference_outputs to compare against"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
elif prompt == TRAJECTORY_ACCURACY_PROMPT:
if reference_outputs is not None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT requires reference_outputs to be None"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
else:
formatted_outputs = _normalize_to_openai_messages_list(outputs)
formatted_reference_outputs = _normalize_to_openai_messages_list(
reference_outputs
)
formatted_inputs = inputs
(
formatted_inputs,
formatted_outputs,
formatted_reference_outputs,
) = _format_inputs(inputs, outputs, reference_outputs)
return _run_evaluator(
run_name=f"llm_as_{feedback_key}_judge",
scorer=scorer,
@@ -265,32 +243,11 @@ async def _wrapped_evaluator(
] = None,
**kwargs,
) -> EvaluatorResult:
if prompt == TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE:
if reference_outputs is None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE requires reference_outputs to compare against"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
elif prompt == TRAJECTORY_ACCURACY_PROMPT:
if reference_outputs is not None:
raise ValueError(
"TRAJECTORY_ACCURACY_PROMPT requires reference_outputs to be None"
)
(
formatted_outputs,
formatted_reference_outputs,
formatted_inputs,
) = _format_inputs(inputs, outputs, reference_outputs)
else:
formatted_outputs = _normalize_to_openai_messages_list(outputs)
formatted_reference_outputs = _normalize_to_openai_messages_list(
reference_outputs
)
formatted_inputs = inputs
(
formatted_inputs,
formatted_outputs,
formatted_reference_outputs,
) = _format_inputs(inputs, outputs, reference_outputs)
return await _arun_evaluator(
run_name=f"llm_as_{feedback_key}_judge",
scorer=scorer,