/**
 * Triangulating Quality from Multiple Benchmark Sources
 *
 * No single benchmark tells the whole story. Arena scores reflect what
 * people prefer in conversation; objective benchmarks measure raw task
 * performance. A model can score well on standardized tests but feel
 * robotic, or charm users while fumbling at structured tasks.
 *
 * This example combines two independent quality signals -- LMArena
 * human preference and Artificial Analysis intelligence index -- so
 * models that rank highly on both rise to the top. Cost and recency
 * serve only as light tiebreakers.
 *
 * Requires ARTIFICIAL_ANALYSIS_API_KEY for AA data. LMArena is free.
 */
fromModelsDev, recommend,
minMaxCriterion, matchesModel,
// API key for Artificial Analysis; used for the authenticated AA fetch below.
const aaKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
// NOTE(review): this error line looks orphaned -- presumably it sits inside an
// `if (!aaKey) { ...; process.exit(1); }` guard that is missing from this
// excerpt. Confirm against the full example.
console.error("Set ARTIFICIAL_ANALYSIS_API_KEY to run this example.");
// Base catalog of models (from models.dev); benchmark signals are merged
// into these records further down.
const models = await fromModelsDev();
// --- Source 1: LMArena (human preference from blind comparisons) ---
// Community-maintained mirror of LMArena leaderboard history; no key needed.
// NOTE(review): the fetch() argument list appears truncated here -- the
// closing `);` of the call is not visible in this excerpt.
const lmResponse = await fetch(
"https://raw.githubusercontent.com/nakasyou/lmarena-history/main/output/scores.json",
if (!lmResponse.ok) throw new Error(`LMArena fetch failed: ${lmResponse.status}`);
// The JSON is keyed by snapshot date; take the newest snapshot's text/overall
// leaderboard. Lexicographic sort() is only chronological if the keys are
// ISO-style dates -- presumably they are; verify against the data file.
const scoresData = await lmResponse.json();
const dates = Object.keys(scoresData).sort();
const latestScores = scoresData[dates[dates.length - 1]].text.overall;
// Turn each [modelId, score] leaderboard entry into a benchmark record.
// NOTE(review): the object literal (and the closing `}));` of this map) is
// cut off in this excerpt.
const arenaBenchmarks = Object.entries(latestScores).map(([modelId, score]) => ({
// --- Source 2: Artificial Analysis (objective benchmark composite) ---
// Authenticated request using the aaKey read from the environment above.
// NOTE(review): the fetch() call's closing `);` is not visible in this excerpt.
const aaResponse = await fetch(
"https://artificialanalysis.ai/api/v2/data/llms/models",
{ headers: { "x-api-key": aaKey } },
if (!aaResponse.ok) throw new Error(`AA fetch failed: ${aaResponse.status}`);
const aaData = await aaResponse.json();
// Keep only models carrying an evaluations payload, then pull out the
// intelligence-index composite (null coalesced to undefined so downstream
// optional fields stay clean). NOTE(review): the object returned by this
// .map() is truncated below -- only the `quality` field is visible; the
// `slug` field used for matching later is presumably also returned.
const aaBenchmarks = aaData.data
.filter((m: Record<string, unknown>) => m.evaluations)
.map((m: Record<string, unknown>) => {
const evals = m.evaluations as Record<string, number | null>;
quality: evals.artificial_analysis_intelligence_index ?? undefined,
// Enrich models with both benchmark values so they flow through to results.
// ScoredModel is generic -- recommend() preserves any extra fields you add.
type MultiScoredModel = Model & { arena?: number; quality?: number };
// Join each catalog model against both benchmark datasets by model id,
// using matchesModel() for fuzzy id/slug matching.
// NOTE(review): the body of this .map() is truncated -- the return of the
// enriched object (presumably spreading `m` plus arena/quality values) is
// not visible in this excerpt.
const multiscoredModels: MultiScoredModel[] = models.map((m) => {
const arena = arenaBenchmarks.find((b) => matchesModel(b.modelId, m.id));
const aa = aaBenchmarks.find((b: { slug: string }) => matchesModel(b.slug, m.id));
// Each benchmark source becomes one criterion. Not every model appears in
// both datasets -- unmatched models score 0 for that criterion (without
// affecting the normalization range), so models with data from both sources
// naturally rank higher.
// minMaxCriterion() normalizes the accessor's raw values across the candidate
// set; an undefined value yields the zero score described above.
const humanPreference = minMaxCriterion((model: MultiScoredModel) => model.arena);
const objectiveQuality = minMaxCriterion((model: MultiScoredModel) => model.quality);
// Blend human preference and objective quality equally, with cost and
// recency as tiebreakers
// NOTE(review): these weighted entries look like the body of a profile array
// (e.g. `const blendedProfile = [ ... ];`) whose declaration and closing
// bracket are missing from this excerpt; `blendedProfile` is referenced at
// the recommend() call below.
{ criterion: humanPreference, weight: 4 },
{ criterion: objectiveQuality, weight: 4 },
{ criterion: costEfficiency, weight: 2 },
{ criterion: recency, weight: 1 },
// Top 10 from direct providers, diverse across providers and families
// NOTE(review): likewise, these look like fields of a selection object
// (e.g. `const selection = { limit: 10, ... };`) whose wrapper is missing.
// perProvider(2)/perFamily(1) cap how many picks share a provider/family.
filter: { providers: [...DIRECT_PROVIDERS] },
constraints: [perProvider(2), perFamily(1)],
// Rank the enriched models using the blended profile and selection rules.
const results = recommend(multiscoredModels, blendedProfile, selection);
// Render a compact comparison table: blended score, rounded arena rating,
// AA intelligence index, and input cost per million tokens ("n/a" where a
// source had no data for the model). NOTE(review): the row object literal
// (and the closing `})));`) is truncated in this excerpt -- a model-name
// column is presumably present in the full example.
console.table(results.map((m) => ({
Score: +m.score.toFixed(3),
Arena: m.arena != null ? Math.round(m.arena) : "n/a",
Quality: m.quality ?? "n/a",
Cost: m.cost?.input != null ? `$${m.cost.input}/M` : "n/a",