Evaluations (evals) are functions that score prompt outputs and determine pass/fail status.
Start with evals first - Build your evaluation framework before writing prompts. Evals provide the foundation for measuring effectiveness and iterating.
Quick Start
Create an eval function that returns {passed, score, reason}:
export const accuracy = async ({ output, expectedOutput, input }) => {
  const match = output.trim() === expectedOutput.trim();
  return {
    passed: match,
    score: match ? 1.0 : 0.0,
    reason: match ? undefined : `Expected "${expectedOutput}", got "${output}"`
  };
};
from agentmark.prompt_core import EvalParams, EvalResult

def accuracy(params: EvalParams) -> EvalResult:
    output = str(params["output"]).strip()
    expected = str(params["expectedOutput"]).strip()
    match = output == expected
    return {
        "passed": match,
        "score": 1.0 if match else 0.0,
        "reason": None if match else f'Expected "{expected}", got "{output}"'
    }
Reference it in your prompt's frontmatter:
---
title: Sentiment Classifier
test_settings:
  dataset: ./datasets/sentiment.jsonl
  evals:
    - accuracy
---
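Each dataset row supplies the input for one test run and, optionally, an expected_output that is passed to evals as expectedOutput. A hypothetical sentiment.jsonl row (the field names here are illustrative; see the Datasets page for the exact format):
{"input": {"text": "I love this product!"}, "expected_output": "positive"}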
Defining an Eval Registry
Define eval functions as a plain object using the EvalRegistry type in your AgentMark client setup:
import type { EvalRegistry } from "@agentmark-ai/prompt-core";

const evalRegistry: EvalRegistry = {
  accuracy: ({ output, expectedOutput }) => {
    const match = output.trim() === expectedOutput?.trim();
    return { passed: match, score: match ? 1 : 0 };
  },
  exact_match: ({ output, expectedOutput }) => ({
    passed: output === expectedOutput,
    score: output === expectedOutput ? 1 : 0,
  }),
  relevance: ({ output, input }) => {
    // Guard against a missing query: includes(undefined) would search for "undefined"
    const query = String(input.query ?? "").toLowerCase();
    const relevant = output.toLowerCase().includes(query);
    return { passed: relevant, score: relevant ? 1 : 0 };
  },
  length_check: ({ output }) => {
    const ok = output.length >= 10 && output.length <= 500;
    return { passed: ok, score: ok ? 1 : 0 };
  },
};
List your registered eval names in agentmark.json to make them available in the platform UI:
{
  "evals": ["accuracy", "exact_match", "relevance", "length_check"]
}
Function Signature
interface EvalParams {
  input: string | Record<string, unknown> | Array<Record<string, unknown> | string>;
  output: string | Record<string, unknown> | Array<Record<string, unknown> | string>;
  expectedOutput?: string; // Maps from dataset's expected_output field
}

interface EvalResult {
  score?: number;   // Numeric score (0-1 recommended)
  passed?: boolean; // Pass/fail status (used by --threshold)
  label?: string;   // Classification label for categorization
  reason?: string;  // Explanation for the result
}

type EvalFunction = (params: EvalParams) => Promise<EvalResult> | EvalResult;
from typing import Any, TypedDict, Callable, Awaitable

class EvalParams(TypedDict):
    input: str | dict[str, Any] | list[dict[str, Any] | str]
    output: str | dict[str, Any] | list[dict[str, Any] | str]
    expectedOutput: str | None  # Note: camelCase in Python

class EvalResult(TypedDict, total=False):
    passed: bool   # Pass/fail status
    score: float   # Numeric score (0-1)
    reason: str    # Explanation for failure
    label: str     # Custom label for categorization

# Both sync and async functions are supported
EvalFunction = Callable[[EvalParams], EvalResult | Awaitable[EvalResult]]
Registering Evals
Define eval functions as a plain object and pass it to your AgentMark client:
import type { EvalRegistry } from "@agentmark-ai/prompt-core";

const evalRegistry: EvalRegistry = {
  accuracy: accuracy,
  contains_keyword: containsKeyword,
};

// Pass to your client
const client = createAgentMarkClient({
  loader,
  modelRegistry,
  evalRegistry,
});
from agentmark.prompt_core import EvalRegistry, FileLoader
from agentmark_pydantic_ai_v0 import (
    create_pydantic_ai_client,
    create_default_model_registry,
)

# Define eval registry as a plain dict
eval_registry: EvalRegistry = {
    "accuracy": accuracy,
    "contains_keyword": contains_keyword,
}

# Create client with eval registry
client = create_pydantic_ai_client(
    model_registry=create_default_model_registry(),
    loader=FileLoader(base_dir="./"),
    eval_registry=eval_registry,
)
from agentmark.prompt_core import EvalRegistry
from agentmark_claude_agent_sdk import (
    create_claude_agent_client,
    ClaudeAgentModelRegistry,
    ClaudeAgentAdapterOptions,
)

# Define eval registry as a plain dict
eval_registry: EvalRegistry = {
    "accuracy": accuracy,
}

# Create client with eval registry
client = create_claude_agent_client(
    model_registry=ClaudeAgentModelRegistry.create_default(),
    adapter_options=ClaudeAgentAdapterOptions(
        permission_mode="bypassPermissions",
    ),
    eval_registry=eval_registry,
)
EvalRegistry type
EvalRegistry is a type alias for a plain object mapping eval names to functions. In TypeScript it is Record<string, EvalFunction>, and in Python it is Dict[str, EvalFunction]. You use standard object/dict operations to work with it:
import type { EvalRegistry } from "@agentmark-ai/prompt-core";

const evalRegistry: EvalRegistry = {
  accuracy: accuracyFn,
  relevance: relevanceFn,
};

// Standard object operations
evalRegistry["new_eval"] = newEvalFn;      // Add an eval
const fn = evalRegistry["accuracy"];       // Get an eval function
const exists = "accuracy" in evalRegistry; // Check if exists
delete evalRegistry["accuracy"];           // Remove an eval
const names = Object.keys(evalRegistry);   // List registered names
from agentmark.prompt_core import EvalRegistry

eval_registry: EvalRegistry = {
    "accuracy": accuracy_fn,
    "relevance": relevance_fn,
}

# Standard dict operations
eval_registry["new_eval"] = new_eval_fn  # Add an eval
fn = eval_registry["accuracy"]           # Get an eval function
exists = "accuracy" in eval_registry     # Check if exists
del eval_registry["accuracy"]            # Remove an eval
names = list(eval_registry.keys())       # List registered names
All fields in EvalResult are optional. Return whichever fields are relevant to your eval. The passed field is used by the CLI --threshold flag to calculate pass rates.
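For example, an eval that only categorizes outputs can skip passed entirely. A minimal sketch:
// Buckets outputs by length without making a pass/fail judgment
export const length_bucket = async ({ output }) => {
  const length = String(output).length;
  return { label: length < 50 ? 'short' : length <= 500 ? 'medium' : 'long' };
};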
Evaluation Types
Reference-Based (Ground Truth)
Compare outputs against known correct answers:
export const exact_match = async ({ output, expectedOutput }) => {
  return {
    passed: output === expectedOutput,
    score: output === expectedOutput ? 1 : 0
  };
};
Use for: Classification, extraction, math problems, multiple choice
Reference-Free (Heuristic)
Check structural requirements without ground truth:
export const has_required_fields = async ({ output }) => {
  const required = ['name', 'email', 'summary'];
  const hasAll = required.every(field => output[field]);
  return {
    passed: hasAll,
    score: hasAll ? 1 : 0,
    reason: hasAll ? undefined : 'Missing required fields'
  };
};
Use for: Format validation, length checks, required content
Model-Graded (LLM-as-Judge)
Use an LLM to evaluate subjective criteria:
import { generateObject } from 'ai';
import { openai } from '@ai-sdk/openai';
import { z } from 'zod';

export const tone_eval = async ({ output, expectedOutput }) => {
  const { object } = await generateObject({
    model: openai('gpt-4o-mini'),
    schema: z.object({ passed: z.boolean(), reasoning: z.string() }),
    prompt: `Evaluate if this response has appropriate ${expectedOutput} tone:\n\n${output}`,
    temperature: 0.1,
  });
  return {
    passed: object.passed,
    score: object.passed ? 1 : 0,
    reason: object.reasoning,
  };
};
Use for: Tone, creativity, helpfulness, semantic similarity
Combine approaches - Use reference-based for correctness, reference-free for structure, and model-graded for subjective quality.
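In frontmatter, combining approaches just means listing several registered evals together, for example (using evals defined earlier on this page):
---
title: Sentiment Classifier
test_settings:
  dataset: ./datasets/sentiment.jsonl
  evals:
    - exact_match   # reference-based
    - length_check  # reference-free
    - tone_eval     # model-graded
---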
Common Patterns
Classification
export const classification_accuracy = async ({ output, expectedOutput }) => {
  const match = output.trim().toLowerCase() === expectedOutput.trim().toLowerCase();
  return {
    passed: match,
    score: match ? 1 : 0,
    reason: match ? undefined : `Expected ${expectedOutput}, got ${output}`
  };
};
def classification_accuracy(params: EvalParams) -> EvalResult:
    output = str(params["output"]).strip().lower()
    expected = str(params["expectedOutput"]).strip().lower()
    match = output == expected
    return {
        "passed": match,
        "score": 1.0 if match else 0.0,
        "reason": None if match else f"Expected {expected}, got {output}"
    }
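Models sometimes emit label variants ('pos', ' Positive'). Normalizing both sides before comparing keeps the eval from failing on formatting noise. A sketch, with an illustrative alias map:
// LABEL_ALIASES is illustrative; adjust to your own label set
const LABEL_ALIASES = { pos: 'positive', neg: 'negative', neu: 'neutral' };

export const normalized_classification = async ({ output, expectedOutput }) => {
  const normalize = (value) => {
    const cleaned = String(value).trim().toLowerCase();
    return LABEL_ALIASES[cleaned] ?? cleaned;
  };
  const match = normalize(output) === normalize(expectedOutput);
  return {
    passed: match,
    score: match ? 1 : 0,
    reason: match ? undefined : `Expected ${expectedOutput}, got ${output}`
  };
};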
Contains Keyword
export const contains_keyword = async ({ output, expectedOutput }) => {
  const contains = output.includes(expectedOutput);
  return {
    passed: contains,
    score: contains ? 1 : 0,
    reason: contains ? undefined : `Output missing "${expectedOutput}"`
  };
};
def contains_keyword(params: EvalParams) -> EvalResult:
    output = str(params["output"])
    expected = str(params["expectedOutput"])
    contains = expected in output
    return {
        "passed": contains,
        "score": 1.0 if contains else 0.0,
        "reason": None if contains else f'Output missing "{expected}"'
    }
Field Presence
export const required_fields = async ({ output }) => {
  const required = ['name', 'email', 'message'];
  const missing = required.filter(field => !(field in output));
  return {
    passed: missing.length === 0,
    score: (required.length - missing.length) / required.length,
    reason: missing.length > 0 ? `Missing: ${missing.join(', ')}` : undefined
  };
};
Length Check
export const length_check = async ({ output }) => {
  const length = output.length;
  const passed = length >= 10 && length <= 500;
  return {
    passed,
    score: passed ? 1 : 0,
    reason: passed ? undefined : `Length ${length} outside range [10, 500]`
  };
};
Format Validation
export const email_format = async ({ output }) => {
  const emailRegex = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
  const passed = emailRegex.test(output);
  return {
    passed,
    score: passed ? 1 : 0,
    reason: passed ? undefined : 'Invalid email format'
  };
};
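The same pattern extends to any parseable format. A sketch that checks whether an output is valid JSON:
// Assumes the prompt returns a raw string that should parse as JSON
export const valid_json = async ({ output }) => {
  try {
    JSON.parse(String(output));
    return { passed: true, score: 1 };
  } catch (error) {
    return { passed: false, score: 0, reason: `Invalid JSON: ${error.message}` };
  }
};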
Graduated Scoring
Use the label field to categorize results:
export const sentiment_gradual = async ({ output, expectedOutput }) => {
  if (output === expectedOutput) {
    return { passed: true, score: 1.0, label: 'exact_match' };
  }
  const partialMatches = {
    'positive': ['very positive', 'somewhat positive'],
    'negative': ['very negative', 'somewhat negative']
  };
  if (partialMatches[expectedOutput]?.includes(output)) {
    return {
      passed: true,
      score: 0.7,
      label: 'partial_match',
      reason: 'Close semantic match'
    };
  }
  return {
    passed: false,
    score: 0,
    label: 'no_match',
    reason: `Expected ${expectedOutput}, got ${output}`
  };
};
Filter by label (exact_match, partial_match, no_match) to understand patterns.
LLM-as-Judge
Using AgentMark Prompts (Recommended)
1. Create eval prompt (agentmark/evals/tone-judge.prompt.mdx):
---
object_config:
  model_name: gpt-4o-mini
  temperature: 0.1
  schema:
    type: object
    properties:
      passed:
        type: boolean
      reasoning:
        type: string
---
<System>
You are evaluating whether an AI response has appropriate professional tone.
First explain your reasoning step-by-step, then provide your final judgment.
</System>

<User>
**Output to evaluate:**
{props.output}

**Expected tone:**
{props.expectedOutput}
</User>
2. Use in eval function:
import { client } from './agentmark-client';
import { generateObject } from 'ai';

export const tone_check = async ({ output, expectedOutput }) => {
  const evalPrompt = await client.loadObjectPrompt('evals/tone-judge.prompt.mdx');
  const formatted = await evalPrompt.format({
    props: { output, expectedOutput }
  });
  const { object } = await generateObject(formatted);
  return {
    passed: object.passed,
    score: object.passed ? 1 : 0,
    reason: object.reasoning,
  };
};
Benefits: Version control eval logic, iterate independently, reuse prompts, leverage templating.
Best Practices
Configuration:
Use low temperature (0.1-0.3) for consistency
Ask for reasoning before judgment (chain-of-thought)
Use binary scoring (PASS/FAIL) rather than scales (1-10)
Test one dimension at a time
Model selection:
Use a stronger model to grade weaker models (e.g., GPT-4 grading GPT-3.5)
Avoid grading a model with itself
Validate with human evaluation before scaling
Usage:
Use sparingly - LLM judges are slower and more expensive than heuristic evals
Reserve for subjective criteria
Watch for position bias, verbosity bias, and self-enhancement bias
Avoid exact-match for open-ended outputs - Use only for classification or short outputs. For longer text, use semantic similarity or LLM-based evaluation.
Domain-Specific Evals
RAG (Retrieval-Augmented Generation)
export const faithfulness = async ({ output, input }) => {
  const context = input.retrieved_context;
  // extractClaims and isSupported are placeholders for your own claim
  // extraction and verification logic (often LLM-assisted)
  const claims = extractClaims(output);
  const supported = claims.every(claim => isSupported(claim, context));
  return {
    passed: supported,
    score: supported ? 1 : 0,
    reason: supported ? undefined : 'Output contains unsupported claims'
  };
};

export const answer_relevancy = async ({ output, input }) => {
  const isRelevant = output.toLowerCase().includes(input.query.toLowerCase());
  return {
    passed: isRelevant,
    score: isRelevant ? 1 : 0,
    reason: isRelevant ? undefined : 'Answer not relevant to query'
  };
};
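The substring check above is deliberately crude. A word-overlap score degrades more gracefully on longer answers; a heuristic sketch (for true semantic similarity, use embeddings or an LLM judge):
export const query_overlap = async ({ output, input }) => {
  const words = (text) => new Set(String(text).toLowerCase().split(/\W+/).filter(Boolean));
  const queryWords = words(input.query);
  const outputWords = words(output);
  const overlap = [...queryWords].filter(word => outputWords.has(word)).length;
  const score = queryWords.size > 0 ? overlap / queryWords.size : 0;
  return {
    passed: score >= 0.5, // arbitrary threshold; tune for your data
    score,
    reason: score >= 0.5 ? undefined : 'Low query-term overlap'
  };
};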
Agent/Tool Calling
export const tool_correctness = async ({ output, expectedOutput }) => {
  const correctTool = output.tool === expectedOutput.tool;
  const correctParams = JSON.stringify(output.parameters) ===
                        JSON.stringify(expectedOutput.parameters);
  return {
    passed: correctTool && correctParams,
    // Partial credit when the right tool is called with wrong parameters
    score: correctTool && correctParams ? 1 : correctTool ? 0.5 : 0,
    reason: !correctTool ? 'Wrong tool selected' :
            !correctParams ? 'Incorrect parameters' : undefined
  };
};
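For this eval to receive structured data, the dataset's expected_output must carry the tool call rather than a plain string. A hypothetical row (field shapes are illustrative):
{"input": {"request": "What's the weather in Paris?"}, "expected_output": {"tool": "get_weather", "parameters": {"city": "Paris"}}}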
Best Practices
Test one thing per eval - separate functions for different criteria
Provide helpful failure reasons for debugging
Use meaningful names (sentiment_accuracy not eval1)
Keep scores in 0-1 range
Make evals deterministic and consistent (avoid flaky tests)
Validate general behavior, not specific outputs (avoid overfitting) - see the sketch below
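For example, instead of pinning a summary to one exact wording, check for the facts it must contain. A sketch with illustrative key terms:
// Brittle: output === 'Q3 revenue grew 12%.' fails on any rephrasing.
// Robust: check the facts that must appear, however they are phrased.
export const mentions_key_facts = async ({ output }) => {
  const keyTerms = ['Q3', 'revenue', '12%']; // illustrative terms
  const text = String(output);
  const missing = keyTerms.filter(term => !text.includes(term));
  return {
    passed: missing.length === 0,
    score: (keyTerms.length - missing.length) / keyTerms.length,
    reason: missing.length > 0 ? `Missing: ${missing.join(', ')}` : undefined
  };
};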
Next Steps
Datasets: Create test datasets
Running Experiments: Run your evaluations
Testing Overview: Learn testing concepts