Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/evaluators/code-evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ export class CodeEvaluator implements Evaluator {
reasoning,
evaluatorRawRequest,
...(details ? { details } : {}),
tokenUsage: proxyUsage?.tokenUsage,
};
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
Expand All @@ -178,6 +179,7 @@ export class CodeEvaluator implements Evaluator {
: {}),
error: message,
},
tokenUsage: proxyUsage?.tokenUsage,
};
} finally {
// Always shut down the proxy when done
Expand Down
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/evaluators/composite.ts
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ export class CompositeEvaluator implements Evaluator {
evaluatorRawRequest: member.result.evaluatorRawRequest,
scores: member.result.scores,
details: member.result.details,
tokenUsage: member.result.tokenUsage,
});
}

Expand Down Expand Up @@ -173,6 +174,7 @@ export class CompositeEvaluator implements Evaluator {
evaluatorRawRequest: member.result.evaluatorRawRequest,
scores: member.result.scores,
details: member.result.details,
tokenUsage: member.result.tokenUsage,
});
}

Expand Down
25 changes: 17 additions & 8 deletions packages/core/src/evaluation/evaluators/llm-judge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { z } from 'zod';
import type { Provider, ProviderResponse } from '../providers/types.js';
import { extractLastAssistantContent } from '../providers/types.js';
import { TEMPLATE_VARIABLES } from '../template-variables.js';
import type { TokenUsage } from '../trace.js';
import type { JsonObject, RubricItem } from '../types.js';
import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js';
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
Expand Down Expand Up @@ -148,7 +149,7 @@ export class LlmJudgeEvaluator implements Evaluator {
};

try {
const { data } = await this.runWithRetry({
const { data, tokenUsage } = await this.runWithRetry({
context,
judgeProvider,
systemPrompt,
Expand All @@ -173,6 +174,7 @@ export class LlmJudgeEvaluator implements Evaluator {
expectedAspectCount,
reasoning,
evaluatorRawRequest,
tokenUsage,
};
} catch {
// Deliberate: parse failures yield score 0 silently — no warning emitted,
Expand Down Expand Up @@ -215,7 +217,7 @@ export class LlmJudgeEvaluator implements Evaluator {
target: judgeProvider.targetName,
};

const { data } = await this.runWithRetry({
const { data, tokenUsage } = await this.runWithRetry({
context,
judgeProvider,
systemPrompt,
Expand All @@ -233,6 +235,7 @@ export class LlmJudgeEvaluator implements Evaluator {
expectedAspectCount: rubrics.length,
reasoning: data.overall_reasoning,
evaluatorRawRequest,
tokenUsage,
};
}

Expand All @@ -254,7 +257,7 @@ export class LlmJudgeEvaluator implements Evaluator {
target: judgeProvider.targetName,
};

const { data } = await this.runWithRetry({
const { data, tokenUsage } = await this.runWithRetry({
context,
judgeProvider,
systemPrompt,
Expand All @@ -273,6 +276,7 @@ export class LlmJudgeEvaluator implements Evaluator {
reasoning: data.overall_reasoning,
evaluatorRawRequest,
details,
tokenUsage,
};
}

Expand Down Expand Up @@ -389,7 +393,7 @@ export class LlmJudgeEvaluator implements Evaluator {
readonly systemPrompt: string;
readonly userPrompt: string;
readonly schema: z.ZodSchema<T>;
}): Promise<{ data: T; providerResponse?: ProviderResponse }> {
}): Promise<{ data: T; providerResponse?: ProviderResponse; tokenUsage?: TokenUsage }> {
const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;

let lastError: Error | undefined;
Expand All @@ -399,16 +403,21 @@ export class LlmJudgeEvaluator implements Evaluator {
// Prefer Vercel AI SDK language model if available.
const model = judgeProvider.asLanguageModel?.();
if (model) {
const { text } = await generateText({
const result = await generateText({
model,
system: systemPrompt,
prompt: userPrompt,
...(this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {}),
...(typeof this.temperature === 'number' ? { temperature: this.temperature } : {}),
});

const data = schema.parse(parseJsonFromText(text));
return { data };
const data = schema.parse(parseJsonFromText(result.text));
const rawUsage = result.usage;
const tokenUsage =
rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
: undefined;
return { data, tokenUsage };
}

const response = await judgeProvider.invoke({
Expand All @@ -421,7 +430,7 @@ export class LlmJudgeEvaluator implements Evaluator {
});

const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
return { data, providerResponse: response };
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
} catch (e: unknown) {
lastError = e instanceof Error ? e : new Error(String(e));
}
Expand Down
6 changes: 5 additions & 1 deletion packages/core/src/evaluation/evaluators/types.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import type { ResolvedTarget } from '../providers/targets.js';
import type { ChatPrompt, Message, Provider } from '../providers/types.js';
import type { TraceSummary } from '../trace.js';
import type { TokenUsage, TraceSummary } from '../trace.js';
import type { EvalTest, EvaluationVerdict, EvaluatorConfig, JsonObject } from '../types.js';

export type { EvaluationVerdict };
Expand Down Expand Up @@ -52,6 +52,8 @@ export interface EvaluationScore {
readonly scores?: readonly ChildEvaluatorResult[];
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
}

export interface ChildEvaluatorResult {
Expand All @@ -67,6 +69,8 @@ export interface ChildEvaluatorResult {
readonly scores?: readonly ChildEvaluatorResult[];
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
}

export interface Evaluator {
Expand Down
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,7 @@ async function runEvaluatorList(options: {
evaluatorProviderRequest: score.evaluatorRawRequest,
details: score.details,
scores: mapChildResults(score.scores),
tokenUsage: score.tokenUsage,
});
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
Expand Down Expand Up @@ -1852,6 +1853,7 @@ function mapChildResults(
evaluatorProviderRequest: child.evaluatorRawRequest,
scores: mapChildResults(child.scores),
details: child.details,
tokenUsage: child.tokenUsage,
}));
}

Expand Down
9 changes: 8 additions & 1 deletion packages/core/src/evaluation/providers/ai-sdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,17 @@ async function invokeModel(options: {

/**
 * Converts a text-generation result into the provider response shape.
 *
 * @param result - Generation result; its `text`, `totalUsage` and `usage` fields are read.
 * @returns A response carrying the raw result, the usage converted via `toJsonObject`,
 *   a single assistant message holding the text, and input/output token counts when
 *   both are present on the usage object.
 */
function mapResponse(result: TextResult): ProviderResponse {
  // A missing text is represented as an empty assistant message.
  const content = result.text ?? '';
  // `totalUsage` takes precedence over `usage` when both are present.
  const rawUsage = result.totalUsage ?? result.usage;
  // Token usage is reported only when both counts are present (0 is a valid count).
  const tokenUsage =
    rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
      ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
      : undefined;

  return {
    raw: result,
    // Single `usage` entry: a second `usage` key in the same literal is a duplicate
    // property, which TypeScript rejects.
    usage: toJsonObject(rawUsage),
    output: [{ role: 'assistant' as const, content }],
    tokenUsage,
  };
}

Expand Down
4 changes: 3 additions & 1 deletion packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { ToolTrajectoryEvaluatorConfig, TraceSummary } from './trace.js';
import type { TokenUsage, ToolTrajectoryEvaluatorConfig, TraceSummary } from './trace.js';

/**
* JSON primitive values appearing in AgentV payloads.
Expand Down Expand Up @@ -736,6 +736,8 @@ export interface EvaluatorResult {
readonly scores?: readonly EvaluatorResult[];
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
}

/**
Expand Down
21 changes: 21 additions & 0 deletions packages/core/src/runtime/target-proxy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { type IncomingMessage, type Server, type ServerResponse, createServer }
import type { AddressInfo } from 'node:net';

import type { Provider } from '../evaluation/providers/types.js';
import type { TokenUsage } from '../evaluation/trace.js';

/**
* Request body for /invoke endpoint
Expand All @@ -32,6 +33,7 @@ export interface TargetProxyInvokeRequest {
export interface TargetProxyInvokeResponse {
/** Output messages taken from the provider response; the handler substitutes an empty array when the provider returns none. */
readonly output: readonly unknown[];
/** Result of `extractLastAssistantContent(output)` as computed by the handler. */
readonly rawText?: string;
/** Token usage from the provider response for this call, passed through unchanged; absent when the provider reports none. */
readonly tokenUsage?: TokenUsage;
}

/**
Expand All @@ -40,6 +42,7 @@ export interface TargetProxyInvokeResponse {
export interface TargetProxyUsageMetadata {
/** Current value of the proxy's call counter (initialised to 0 when the proxy is created). */
readonly callCount: number;
/** NOTE(review): presumably the upper bound enforced on `callCount` — confirm against the request handler. */
readonly maxCalls: number;
/** Input/output token totals summed over invocations whose response reported usage; undefined while both totals are zero. */
readonly tokenUsage?: TokenUsage;
}

/**
Expand Down Expand Up @@ -93,6 +96,8 @@ export async function createTargetProxy(options: TargetProxyOptions): Promise<Ta

let callCount = 0;
let isShutdown = false;
let totalInputTokens = 0;
let totalOutputTokens = 0;

// Build available targets list - always includes default
const targetsList: readonly string[] = availableTargets ?? [defaultProvider.targetName];
Expand Down Expand Up @@ -202,13 +207,19 @@ export async function createTargetProxy(options: TargetProxyOptions): Promise<Ta
attempt: request.attempt ?? 1,
});

if (response.tokenUsage) {
totalInputTokens += response.tokenUsage.input;
totalOutputTokens += response.tokenUsage.output;
}

// Extract output messages and rawText
const output = response.output ?? [];
const rawText = extractLastAssistantContent(output);

const result: TargetProxyInvokeResponse = {
output,
rawText,
tokenUsage: response.tokenUsage,
};

sendJson(res, 200, result);
Expand Down Expand Up @@ -267,10 +278,16 @@ export async function createTargetProxy(options: TargetProxyOptions): Promise<Ta
attempt: request.attempt ?? 1,
});

if (response.tokenUsage) {
totalInputTokens += response.tokenUsage.input;
totalOutputTokens += response.tokenUsage.output;
}

const output = response.output ?? [];
responses.push({
output,
rawText: extractLastAssistantContent(output),
tokenUsage: response.tokenUsage,
});
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
Expand Down Expand Up @@ -315,6 +332,10 @@ export async function createTargetProxy(options: TargetProxyOptions): Promise<Ta
// Reports the current call counters together with the accumulated token totals.
getUsageMetadata: () => {
  // Token usage is left undefined until at least one positive total has been recorded.
  const hasRecordedTokens = totalInputTokens > 0 || totalOutputTokens > 0;
  const tokenUsage = hasRecordedTokens
    ? { input: totalInputTokens, output: totalOutputTokens }
    : undefined;

  return {
    callCount,
    maxCalls,
    tokenUsage,
  };
},
};
}
Expand Down
Loading