[{"data":1,"prerenderedAt":1082},["ShallowReactive",2],{"navigation":3,"\u002Fnews\u002Finsights\u002Fllm-judge-correlated-errors":281,"\u002Fnews\u002Finsights\u002Fllm-judge-correlated-errors-surround":579},[4,8,17,21,25,29,33,37,269,273,277],{"title":5,"path":6,"stem":7},"About Thinkata Intelligence","\u002Fabout","about",{"title":9,"path":10,"stem":11,"children":12},"Authentication","\u002Fauth","auth",[13],{"title":14,"path":15,"stem":16},"Email Confirmation","\u002Fauth\u002Fconfirmation","auth\u002Fconfirmation",{"title":18,"path":19,"stem":20},"Case Studies","\u002Fcase-studies","case-studies",{"title":22,"path":23,"stem":24},"Contact Us","\u002Fcontact","contact",{"title":26,"path":27,"stem":28},"Thinkata - Advanced AI Engineering & Multi-Agent System Solutions","\u002F","index",{"title":30,"path":31,"stem":32},"Insights","\u002Finsights","insights",{"title":34,"path":35,"stem":36},"Leadership","\u002Fleadership","leadership",{"title":38,"path":39,"stem":40,"children":41},"News","\u002Fnews","news",[42,45,69],{"title":43,"path":39,"stem":44},"News & Insights","news\u002Findex",{"title":18,"path":46,"stem":47,"children":48},"\u002Fnews\u002Fcase-studies","news\u002Fcase-studies",[49,53,57,61,65],{"title":50,"path":51,"stem":52},"Building Secure and Scalable AI Infrastructure: Integrating with Existing Systems through Modern Cloud Frameworks","\u002Fnews\u002Fcase-studies\u002Fcloud-infrastructure-ai","news\u002Fcase-studies\u002Fcloud-infrastructure-ai",{"title":54,"path":55,"stem":56},"Making Sense of Financial Regulations: How AI Teams Can Tackle Complex Documents","\u002Fnews\u002Fcase-studies\u002Ffinancial-regulations","news\u002Fcase-studies\u002Ffinancial-regulations",{"title":58,"path":59,"stem":60},"AI-Powered Transformations in Healthcare","\u002Fnews\u002Fcase-studies\u002Fhealth-care","news\u002Fcase-studies\u002Fhealth-care",{"title":62,"path":63,"stem":64},"Generative AI in Upstream Natural Gas: Shell's Exploration 
Initiative","\u002Fnews\u002Fcase-studies\u002Foil-gas","news\u002Fcase-studies\u002Foil-gas",{"title":66,"path":67,"stem":68},"Optimizing Manufacturing with AI-Driven Multi-Agent Systems","\u002Fnews\u002Fcase-studies\u002Fsupply-chain-optimization","news\u002Fcase-studies\u002Fsupply-chain-optimization",{"title":30,"path":70,"stem":71,"children":72},"\u002Fnews\u002Finsights","news\u002Finsights",[73,77,81,85,89,93,97,101,105,109,113,117,121,125,129,133,137,141,145,149,153,157,161,165,169,173,177,181,185,189,193,197,201,205,209,213,217,221,225,229,233,237,241,245,249,253,257,261,265],{"title":74,"path":75,"stem":76},"The Capability-Reliability Split in Agent Systems","\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split","news\u002Finsights\u002Fagent-capability-reliability-split",{"title":78,"path":79,"stem":80},"The Rise of AI Agents in Cyberattacks: Latest Research and Threats","\u002Fnews\u002Finsights\u002Fai-agent-cyber-threats","news\u002Finsights\u002Fai-agent-cyber-threats",{"title":82,"path":83,"stem":84},"The Smart Enterprise AI Stack: Why Teams of AI Agents Beat Solo Models Consistently","\u002Fnews\u002Finsights\u002Fai-architecture","news\u002Finsights\u002Fai-architecture",{"title":86,"path":87,"stem":88},"When Seeing Everything Becomes the Only Option","\u002Fnews\u002Finsights\u002Fai-comprehensive-observability","news\u002Finsights\u002Fai-comprehensive-observability",{"title":90,"path":91,"stem":92},"The Data Infrastructure AI-Native Systems Can't Ignore","\u002Fnews\u002Finsights\u002Fai-data-layer","news\u002Finsights\u002Fai-data-layer",{"title":94,"path":95,"stem":96},"Enterprise AI Triage Systems: Intelligent Automation for Large-Scale Operations","\u002Fnews\u002Finsights\u002Fai-enterprise-triage","news\u002Finsights\u002Fai-enterprise-triage",{"title":98,"path":99,"stem":100},"When Oversight Becomes 
Infrastructure","\u002Fnews\u002Finsights\u002Fai-governed-autonomy","news\u002Finsights\u002Fai-governed-autonomy",{"title":102,"path":103,"stem":104},"Designing for Graceful Failure in Compound AI Systems","\u002Fnews\u002Finsights\u002Fai-graceful-failure","news\u002Finsights\u002Fai-graceful-failure",{"title":106,"path":107,"stem":108},"Intelligent Composability: Building AI Systems Like Orchestra, Not Soloists","\u002Fnews\u002Finsights\u002Fai-intelligent-composability","news\u002Finsights\u002Fai-intelligent-composability",{"title":110,"path":111,"stem":112},"Building the Plane While Flying It — Migrating from Monolith to AI-Native Without Stopping","\u002Fnews\u002Finsights\u002Fai-migration-path","news\u002Finsights\u002Fai-migration-path",{"title":114,"path":115,"stem":116},"Stability Through Continuous Adaptation","\u002Fnews\u002Finsights\u002Fai-native-overview","news\u002Finsights\u002Fai-native-overview",{"title":118,"path":119,"stem":120},"Provable Stability: Mathematical Guarantees for Adaptive AI Systems","\u002Fnews\u002Finsights\u002Fai-provable-stability","news\u002Finsights\u002Fai-provable-stability",{"title":122,"path":123,"stem":124},"How Temperature Tuning Makes or Breaks Reinforcement Learning","\u002Fnews\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse","news\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse",{"title":126,"path":127,"stem":128},"Testing What Can't Be Predicted","\u002Fnews\u002Finsights\u002Fai-systems-testing","news\u002Finsights\u002Fai-systems-testing",{"title":130,"path":131,"stem":132},"Closing the Loop: How Human Corrections Can Make AI Systems Smarter Over Time","\u002Fnews\u002Finsights\u002Fclosing-the-loop","news\u002Finsights\u002Fclosing-the-loop",{"title":134,"path":135,"stem":136},"Multi-Path Reasoning: Collaborative and Competitive Approaches in 
AI","\u002Fnews\u002Finsights\u002Fcollaborative-competitive-agents","news\u002Finsights\u002Fcollaborative-competitive-agents",{"title":138,"path":139,"stem":140},"Why Challenges Supercharge Smarts for Humans and AI","\u002Fnews\u002Finsights\u002Fcompetition-improves-ai","news\u002Finsights\u002Fcompetition-improves-ai",{"title":142,"path":143,"stem":144},"Context is Infrastructure, Not Instructions","\u002Fnews\u002Finsights\u002Fcontext-is-infrastructure","news\u002Finsights\u002Fcontext-is-infrastructure",{"title":146,"path":147,"stem":148},"Context is the New Code","\u002Fnews\u002Finsights\u002Fcontext-is-new-code","news\u002Finsights\u002Fcontext-is-new-code",{"title":150,"path":151,"stem":152},"Continuous Thought Machines","\u002Fnews\u002Finsights\u002Fcontinuous-thought-machines","news\u002Finsights\u002Fcontinuous-thought-machines",{"title":154,"path":155,"stem":156},"Don't Vibe, Architect","\u002Fnews\u002Finsights\u002Fdont-vibe-architect","news\u002Finsights\u002Fdont-vibe-architect",{"title":158,"path":159,"stem":160},"The Edge of the Underdefined","\u002Fnews\u002Finsights\u002Fedge-of-the-underdefined","news\u002Finsights\u002Fedge-of-the-underdefined",{"title":162,"path":163,"stem":164},"Experts All the Way Down","\u002Fnews\u002Finsights\u002Fexperts-all-the-way","news\u002Finsights\u002Fexperts-all-the-way",{"title":166,"path":167,"stem":168},"A Multi-Tier Safety Architecture for Critical Applications","\u002Fnews\u002Finsights\u002Ffour-tier-architecture","news\u002Finsights\u002Ffour-tier-architecture",{"title":170,"path":171,"stem":172},"Green Dashboard, Unhappy Users","\u002Fnews\u002Finsights\u002Fgreen-dashboard-unhappy-users","news\u002Finsights\u002Fgreen-dashboard-unhappy-users",{"title":174,"path":175,"stem":176},"Hybrid Autoregressive Residual Tokens","\u002Fnews\u002Finsights\u002Fhart-model","news\u002Finsights\u002Fhart-model",{"title":178,"path":179,"stem":180},"Hierarchical Reasoning in Artificial 
Intelligence","\u002Fnews\u002Finsights\u002Fhierarchical-approaches","news\u002Finsights\u002Fhierarchical-approaches",{"title":182,"path":183,"stem":184},"Latent Diffusion for Language Generation: A Comprehensive Overview","\u002Fnews\u002Finsights\u002Flatent-diffusion-for-language","news\u002Finsights\u002Flatent-diffusion-for-language",{"title":186,"path":187,"stem":188},"Breaking Language Barriers: How AI Can Translate Without Examples","\u002Fnews\u002Finsights\u002Flearning-languages","news\u002Finsights\u002Flearning-languages",{"title":190,"path":191,"stem":192},"The Emergence of AI Deception: How Large Language Models Have Learned to Strategically Mislead Users","\u002Fnews\u002Finsights\u002Fllm-deception","news\u002Finsights\u002Fllm-deception",{"title":194,"path":195,"stem":196},"Grading on a Shared Curve","\u002Fnews\u002Finsights\u002Fllm-judge-correlated-errors","news\u002Finsights\u002Fllm-judge-correlated-errors",{"title":198,"path":199,"stem":200},"Synergizing Specialized Reasoning and General Capabilities in AI","\u002Fnews\u002Finsights\u002Fllm-reasoning-advances","news\u002Finsights\u002Fllm-reasoning-advances",{"title":202,"path":203,"stem":204},"The AI That Rewrites Itself: MIT's Breakthrough in Self-Adapting Language Models","\u002Fnews\u002Finsights\u002Fllm-seal","news\u002Finsights\u002Fllm-seal",{"title":206,"path":207,"stem":208},"Metacognitive Reinforcement Learning for Self-Improving AI Systems","\u002Fnews\u002Finsights\u002Fmetacognitive-reinforcement-learning","news\u002Finsights\u002Fmetacognitive-reinforcement-learning",{"title":210,"path":211,"stem":212},"Revolutionary Advancements in Mixture of Experts (MoE) Architectures","\u002Fnews\u002Finsights\u002Fmixture-of-experts","news\u002Finsights\u002Fmixture-of-experts",{"title":214,"path":215,"stem":216},"Balancing Neural Plasticity and 
Stability","\u002Fnews\u002Finsights\u002Fneural-plasticity","news\u002Finsights\u002Fneural-plasticity",{"title":218,"path":219,"stem":220},"Offline RL and the Data Flywheel","\u002Fnews\u002Finsights\u002Foffline-rl-data-flywheel","news\u002Finsights\u002Foffline-rl-data-flywheel",{"title":222,"path":223,"stem":224},"When Optimization Optimizes Itself","\u002Fnews\u002Finsights\u002Frecursive-goodhart","news\u002Finsights\u002Frecursive-goodhart",{"title":226,"path":227,"stem":228},"Reward Design as Architecture","\u002Fnews\u002Finsights\u002Freward-design-as-architecture","news\u002Finsights\u002Freward-design-as-architecture",{"title":230,"path":231,"stem":232},"When Success Has No Author: The Temporal Credit Assignment Problem","\u002Fnews\u002Finsights\u002Frl-credit-assignment-problem","news\u002Finsights\u002Frl-credit-assignment-problem",{"title":234,"path":235,"stem":236},"Beyond Entropy Collapse: When Exploration Succeeds but Learning Fails","\u002Fnews\u002Finsights\u002Frl-optimization-gaps","news\u002Finsights\u002Frl-optimization-gaps",{"title":238,"path":239,"stem":240},"The Path to Practical Confidential Computing for AI Systems","\u002Fnews\u002Finsights\u002Fsecure-ai-architectures","news\u002Finsights\u002Fsecure-ai-architectures",{"title":242,"path":243,"stem":244},"Guess First, Check Later","\u002Fnews\u002Finsights\u002Fspeculative-execution-pattern","news\u002Finsights\u002Fspeculative-execution-pattern",{"title":246,"path":247,"stem":248},"Spiking Neural Networks for Energy-Efficient AI","\u002Fnews\u002Finsights\u002Fspiking-neural-networks","news\u002Finsights\u002Fspiking-neural-networks",{"title":250,"path":251,"stem":252},"The Turn as the Unit of Quality","\u002Fnews\u002Finsights\u002Fstructured-iteration-quality","news\u002Finsights\u002Fstructured-iteration-quality",{"title":254,"path":255,"stem":256},"AI Speech Translation: Breaking Down Language 
Barriers","\u002Fnews\u002Finsights\u002Fsts-performance-advances","news\u002Finsights\u002Fsts-performance-advances",{"title":258,"path":259,"stem":260},"Test-Time Training Layers: The Next Evolution in Transformer Architecture","\u002Fnews\u002Finsights\u002Ftest-time-training-layers","news\u002Finsights\u002Ftest-time-training-layers",{"title":262,"path":263,"stem":264},"Breakthrough: Large Language Models Pass the Turing Test","\u002Fnews\u002Finsights\u002Fturing-tests","news\u002Finsights\u002Fturing-tests",{"title":266,"path":267,"stem":268},"Training in a World That Does Not Exist Yet","\u002Fnews\u002Finsights\u002Fworld-models-as-infrastructure","news\u002Finsights\u002Fworld-models-as-infrastructure",{"title":270,"path":271,"stem":272},"Privacy Policy","\u002Fprivacy","privacy",{"title":274,"path":275,"stem":276},"Research","\u002Fresearch","research",{"title":278,"path":279,"stem":280},"Terms of Service","\u002Fterms","terms",{"id":282,"title":194,"body":283,"date":561,"description":562,"extension":563,"image":564,"meta":565,"navigation":576,"path":195,"seo":577,"stem":196,"__hash__":578},"insights\u002Fnews\u002Finsights\u002Fllm-judge-correlated-errors.md",{"type":284,"value":285,"toc":550},"minimark",[286,305,315,335,338,342,350,360,364,374,380,390,400,404,413,416,420,423,429,437,444,448,451],[287,288,291,292,291,298],"div",{"className":289},[290],"page-title","\n  ",[293,294,194],"h1",{"className":295,"id":297},[296],"page-title__main","grading-on-a-shared-curve",[299,300,304],"h2",{"className":301,"id":303},[302],"page-title__sub","what-happens-when-the-model-judging-your-ai-system-learned-from-the-same-data-it-did","What Happens When the Model Judging Your AI System Learned From the Same Data It Did",[287,306,308,309],{"style":307},"width: 100%; padding: 2%;","\n    ",[310,311],"img",{"src":312,"alt":313,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1677641905377-e3f6087b8b7a?w=1200&auto=format&fit=crop","An antique brass balance 
scale in a museum display case, analogous to two measurement instruments calibrated against the same reference weight, where they will agree with each other while both being wrong by the same amount","width: 100%; height: auto;",[316,317,318,319,327,328,334],"p",{},"Two scales calibrated against the same reference weight will agree with each other almost perfectly. They will also be wrong by the same amount, in the same direction, every time. That agreement is not evidence of accuracy. It is evidence of shared calibration. Something structurally similar happens when one language model is used to grade another. The judge and the model under evaluation are usually trained on overlapping slices of the same public text, share architecture lineage and training objectives, and often descend from the same base model. The arrangement is now standard practice. A second model reads the first model's output and scores it for quality, relevance, safety, or correctness, and that score gates a release or fills a dashboard. The appeal is plain, since human review is slow and expensive, and a model that agrees with human raters most of the time reads like a cheap stand-in. One widely cited study found that a strong model used as a judge reaches over 80 percent agreement with human annotators, roughly the rate at which humans agree with one another ",[320,321,322],"sup",{},[323,324,326],"a",{"href":325},"#source-1","[1]",". Surveys now treat the practice, usually called LLM-as-a-judge, as a routine evaluation tool rather than an experiment ",[320,329,330],{},[323,331,333],{"href":332},"#source-2","[2]",".",[316,336,337],{},"The question worth sitting with is what that agreement number actually establishes. When two systems learn from the same data, their errors are not independent. They tend to break in the same places, on the same kinds of inputs, for the same reasons. 
An evaluation built on that arrangement measures something real, but it may be measuring how well two correlated models agree rather than the quality a user would perceive. A judge that shares a blind spot with the model it scores cannot see into that blind spot any better than the model can.",[299,339,341],{"id":340},"what-agreement-does-and-does-not-establish","What Agreement Does and Does Not Establish",[316,343,344,345,349],{},"High agreement with human raters is necessary for a judge to be useful, but it is not sufficient to trust the judge as a measurement instrument. The same study that reported strong agreement also documented the failure modes that come with model judges, including position bias, where the judge favors whichever answer appears first, verbosity bias, where longer answers score higher regardless of content, and self-enhancement bias, where a model rates outputs that resemble its own more generously than a human would ",[320,346,347],{},[323,348,326],{"href":325},". A judge can hit a high agreement rate on average while still being systematically wrong on the cases that matter most.",[316,351,352,353,359],{},"A 2025 analysis put a formal boundary around this concern. When the judge is no more accurate than the model it evaluates, no debiasing method can reduce the number of ground truth labels needed by more than half, and a high rate of agreement does not, on its own, limit how far a biased judge can distort a comparison between models ",[320,354,355],{},[323,356,358],{"href":357},"#source-5","[5]",". Ground truth here means labels a team actually trusts, typically careful human judgments. The result speaks directly to the case where the goal is to evaluate a model that is as strong as or stronger than the judge, which is exactly the situation when a frontier system is being assessed. 
The headline agreement statistic and the trustworthiness of the ranking are not the same quantity, and one can be high while the other is low.",[299,361,363],{"id":362},"why-the-errors-line-up","Why the Errors Line Up",[316,365,366,367,373],{},"The reason the errors correlate is that the judge and the evaluated model are drawing on the same priors. Research on self-evaluation found that models such as GPT-4 and Llama 2 can recognize their own outputs out of the box at non-trivial accuracy, and that the strength of a model's preference for its own outputs rises in linear step with how well it can recognize them ",[320,368,369],{},[323,370,372],{"href":371},"#source-3","[3]",". A judge that can tell which text it would have written, and rewards that text, is not grading on the merits. It is grading on familiarity.",[287,375,308,376],{"style":307},[310,377],{"src":378,"alt":379,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1761472084007-b54d19f40123?w=1200&auto=format&fit=crop","A close-up of a guitar headstock with tuning pegs, analogous to tuning an instrument against a fixed external pitch, where matching one guitar to another that is itself slightly flat leaves both consonant with each other yet both below true pitch",[316,381,382,383,389],{},"A tuner does not have an opinion about whether a note sounds pleasant. It reports one fixed reference frequency, and the string is turned until it matches. Tune one guitar to a second guitar that happens to sit slightly flat, and the two will sound in tune with each other while both sit below true pitch. The reference has to come from outside the pair, and that independence is what a model judge lacks. A study probing the mechanism behind self-preference offers a cleaner statement of the problem. Models assign higher scores to text with lower perplexity than human raters do, regardless of who wrote that text ",[320,384,385],{},[323,386,388],{"href":387},"#source-4","[4]",". 
Perplexity is a measure of how surprised a model is by a passage, or put another way, how likely the model would have been to produce it. Low perplexity means the text sits comfortably inside what the model already expects. So a judge favors the answers it finds familiar, and the answers it finds familiar are the ones shaped like its own training distribution, which is the distribution the evaluated model was largely trained on too. The bias is not a quirk of one model recognizing itself. It is a pull toward the shared center of the data both models came from.",[316,391,392,393,399],{},"There is a related finding that sharpens the worry. Work on reasoning showed that models often fail to correct their own answers without external feedback, and that accuracy sometimes drops after a self-correction pass ",[320,394,395],{},[323,396,398],{"href":397},"#source-7","[7]",". Asking a model to grade another model from the same family is close to asking it to self-correct. If the generator missed something because its priors pointed the wrong way, a judge built on the same priors is likely to miss it for the same reason.",[299,401,403],{"id":402},"golden-sets-have-a-shelf-life","Golden Sets Have a Shelf Life",[316,405,406,407,334],{},"Many teams anchor their evaluation in a golden test set, a fixed collection of inputs paired with answers a human curated and trusts. The assumption is that the set is a stable yardstick. That assumption weakens over time, and data contamination is the main reason. Contamination is the presence of test examples in a model's pre-training data, which lets the model score well by partial recall rather than by the capability the test was meant to probe. 
A method for detecting it, built on prompting a model to complete withheld portions of known examples, found that GPT-4 had been exposed to several standard datasets, including AG News, WNLI, and XSum, and reported detection accuracy between 92 and 100 percent against expert review ",[320,408,409],{},[323,410,412],{"href":411},"#source-6","[6]",[316,414,415],{},"The practical consequence is that a golden set has a shelf life tied to model release cycles. A set that was clean when it was written can quietly become contaminated once it has been published long enough to be swept into the next pre-training run. When the base model version under the judge changes, the evaluator's priors shift with it, and scores on a familiar set drift for reasons that have nothing to do with the system being tested. This is why it helps to hold part of the golden data outside the model's training window, using freshly authored examples that have never been posted publicly, so at least one slice of the evaluation is measuring capability rather than memory.",[299,417,419],{"id":418},"what-a-production-eval-architecture-looks-like","What a Production Eval Architecture Looks Like",[316,421,422],{},"Treating the model judge as one layer rather than the whole evaluation is what keeps these failure modes contained. A workable architecture tends to have three layers, ordered from cheapest and most trustworthy to most expensive and most subjective.",[287,424,308,425],{"style":307},[310,426],{"src":427,"alt":428,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1760445528355-19c965df8d4d?w=1200&auto=format&fit=crop","Flour being sifted through a fine sieve into a bowl, analogous to a graded screen where a coarse, cheap pass catches the obvious lumps before anything reaches a finer and more expensive stage",[316,430,431,432,436],{},"Sifting works because the coarse screen runs first and catches the obvious lumps cheaply, so only what passes through reaches the finer, slower stages. 
An evaluation harness benefits from the same ordering. The first layer is deterministic. Schema validation, exact-match checks, unit tests, regular expressions, and other hard rules catch a large share of failures at almost no cost, and they carry no correlated-error risk because a passing test is an external fact rather than another model's opinion. The second layer is model-based scoring, reserved for the dimensions that resist hard rules, such as tone, helpfulness, or faithfulness to a source. This is where the safeguards matter. Drawing the judge from a different model family than the one under test reduces shared blind spots, randomizing answer order blunts position bias, and hiding which system produced an output limits self-recognition effects ",[320,433,434],{},[323,435,333],{"href":332},". The third layer is a periodic human-labeled sample, small but regular, that serves as the ground truth the other two layers are calibrated against.",[316,438,439,440,334],{},"Detecting evaluator drift is the part teams most often skip. A simple practice is to keep a frozen reference set of outputs with settled scores and re-run the judge against it whenever the judge model or its version changes. If the scores move while the outputs have not, the judge has drifted, and any trend measured across that boundary is suspect. The same frozen set surfaces the slow contamination problem, since a judge that suddenly finds familiar examples easier is telling on itself. None of this removes the value of a model judge. The formal result on frontier evaluation is a reminder of the ceiling rather than a reason to abandon the tool, since debiasing against a modest pool of trusted labels still helps, just not without limit ",[320,441,442],{},[323,443,358],{"href":357},[299,445,447],{"id":446},"what-this-suggests","What This Suggests",[316,449,450],{},"The evidence points toward a modest reframing rather than a rejection of model judges. 
An LLM judge is a useful, scalable signal, but it is not the same kind of instrument as a compiler, a passing test, or a held-out human label, because it shares its priors with the thing it measures. The cases where that matters most are the ones a team most wants to get right, the novel inputs, the stronger model, the subtle failure that sits inside a blind spot both models inherited from the same data. Layering the judge behind deterministic checks, anchoring it to a human sample, watching it for drift, and holding some evaluation data outside the training window are not heavy additions. They are what turns a mirror back into a measurement.",[287,452,291,456,291,459],{"className":453},[454,455],"references","mt-8",[299,457,458],{"id":454},"References",[460,461,308,467,308,485,308,497,308,508,308,518,308,530,308,540,291],"ol",{"className":462},[463,464,465,466],"list-decimal","list-inside","space-y-2","mt-4",[468,469,471,472,476,477],"li",{"id":470},"source-1","L. Zheng et al., \"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena,\" in ",[473,474,475],"em",{},"Proc. NeurIPS Datasets and Benchmarks Track",", 2023, ",[323,478,484],{"href":479,"target":480,"className":481},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.05685","_blank",[482,483],"text-blue-600","underline","[Online]",[468,486,488,489,492,493],{"id":487},"source-2","J. Gu et al., \"A Survey on LLM-as-a-Judge,\" ",[473,490,491],{},"arXiv",", 2024, ",[323,494,484],{"href":495,"target":480,"className":496},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2411.15594",[482,483],[468,498,500,501,492,504],{"id":499},"source-3","A. Panickssery, S. R. Bowman, and S. Feng, \"LLM Evaluators Recognize and Favor Their Own Generations,\" in ",[473,502,503],{},"Proc. 38th Conf. Neural Inf. Process. Syst. (NeurIPS)",[323,505,484],{"href":506,"target":480,"className":507},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2404.13076",[482,483],[468,509,511,512,492,514],{"id":510},"source-4","K. Wataoka, T. Takahashi, and R. 
Ri, \"Self-Preference Bias in LLM-as-a-Judge,\" ",[473,513,491],{},[323,515,484],{"href":516,"target":480,"className":517},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.21819",[482,483],[468,519,521,522,525,526],{"id":520},"source-5","F. Dorner, V. Nastl, and M. Hardt, \"Limits to Scalable Evaluation at the Frontier: LLM as Judge Won't Beat Twice the Data,\" in ",[473,523,524],{},"Proc. International Conference on Learning Representations (ICLR)",", 2025, ",[323,527,484],{"href":528,"target":480,"className":529},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.13341",[482,483],[468,531,533,534,492,536],{"id":532},"source-6","S. Golchin and M. Surdeanu, \"Time Travel in LLMs: Tracing Data Contamination in Large Language Models,\" in ",[473,535,524],{},[323,537,484],{"href":538,"target":480,"className":539},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2308.08493",[482,483],[468,541,543,544,492,546],{"id":542},"source-7","J. Huang et al., \"Large Language Models Cannot Self-Correct Reasoning Yet,\" in ",[473,545,524],{},[323,547,484],{"href":548,"target":480,"className":549},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2310.01798",[482,483],{"title":551,"searchDepth":552,"depth":552,"links":553},"",2,[554,555,556,557,558,559,560],{"id":303,"depth":552,"text":304},{"id":340,"depth":552,"text":341},{"id":362,"depth":552,"text":363},{"id":402,"depth":552,"text":403},{"id":418,"depth":552,"text":419},{"id":446,"depth":552,"text":447},{"id":454,"depth":552,"text":458},"2026-06-05","Most teams now use one language model to score another, and a judge that agrees with human raters most of the time looks like a cheap substitute for review. 
The harder question is what that agreement establishes when the judge and the model it scores learned from the same data and tend to fail in the same places.","md",{"src":312},{"authors":566,"badge":572,"source":574},[567],{"avatar":568,"name":570,"to":571},{"src":569},"\u002Fimg\u002Fmark_avatar.png","Mark Williams","https:\u002F\u002Fthinkata.com",{"label":573},"AI Evaluation",{"name":575,"url":571},"Thinkata Research",true,{"title":194,"description":562},"Q9s8Rdhe0naZWzsZ4IL1LVANG6JAFMJjnufjZ-3LrKI",[580,812],{"id":581,"title":170,"body":582,"date":801,"description":802,"extension":563,"image":803,"meta":804,"navigation":576,"path":171,"seo":810,"stem":172,"__hash__":811,"_path":171},"insights\u002Fnews\u002Finsights\u002Fgreen-dashboard-unhappy-users.md",{"type":284,"value":583,"toc":791},[584,596,602,611,614,618,624,632,635,639,647,650,654,662,670,674,680,688,691,695,721,725,728],[287,585,291,587,291,591],{"className":586},[290],[293,588,170],{"className":589,"id":590},[296],"green-dashboard-unhappy-users",[299,592,595],{"className":593,"id":594},[302],"why-passing-every-eval-doesnt-mean-your-ai-works","Why Passing Every Eval Doesn't Mean Your AI Works",[287,597,308,598],{"style":307},[310,599],{"src":600,"alt":601,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1762163516269-3c143e04175c?w=1200&auto=format&fit=crop","A rack of status lights all glowing green from edge to edge, analogous to a monitoring dashboard where every indicator reads healthy while the question that matters, whether the people on the other end are well served, is not one any of these lights measures",[316,603,604,605,607,608,610],{},"A team ships a model update. The evaluation suite runs, every check comes back green, the automated quality score clears its threshold. A week later the support queue is filling and the usage curve is bending the wrong way. The usual explanations for this gap are by now well documented, and most of them blame the test. 
The grader might share the model's blind spots, the problem traced in ",[323,606,194],{"href":195},". The single score might hide how much the system wobbles from one run to the next, the subject of ",[323,609,74],{"href":75},". Both are stories about a flawed instrument.",[316,612,613],{},"Set them aside for a moment. Suppose the eval is clean, stable, free of contamination, and scored honestly. A green dashboard can still sit above unhappy users for a reason that has nothing to do with the test being broken. The dashboard measures the model, by itself, on a fixed set of inputs, at one point in time. The user lives with the whole system, in back-and-forth, over weeks. The model, the human-AI team, and the running experience are three different things, and an offline eval only ever sees the first. Three fields that have been living this exact gap for years, recommender systems, human-computer interaction, and online experimentation, have both the evidence and the vocabulary worth borrowing.",[299,615,617],{"id":616},"the-soloist-and-the-duet","The Soloist and the Duet",[287,619,308,620],{"style":307},[310,621],{"src":622,"alt":623,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1516016906593-866c0d0356af?w=1200&auto=format&fit=crop","A man and a woman playing string instruments together as a duet, analogous to a human-AI team, where what matters is how the pair sounds together rather than how well either musician would score playing alone",[316,625,626,627,631],{},"A clean recording of a soloist says very little about how the duet will sound. The thing a user experiences is the duet, the model and the person working through a task together, and that joint performance is what an eval of the model alone leaves out. 
Bansal and colleagues studied this directly with an AI whose accuracy was comparable to the humans it assisted, asking whether the pair could reach complementary performance, where the team is more accurate than either the person or the model working solo ",[320,628,629],{},[323,630,326],{"href":325},". Adding the AI helped. Adding explanations of the AI's reasoning, the feature most often assumed to help further, did not increase complementarity. Instead, explanations raised the chance that people accepted the AI's recommendation regardless of whether it was correct.",[316,633,634],{},"That last detail is the uncomfortable one. A model can post a higher solo score and leave the team no better off, and a change meant to improve the experience can make the team worse by encouraging misplaced trust. The dashboard grades the soloist. The user hears the duet, and the duet has its own failure modes that the solo score cannot register.",[299,636,638],{"id":637},"offline-wins-do-not-always-survive-contact","Offline Wins Do Not Always Survive Contact",[316,640,641,642,646],{},"Recommender systems have run this comparison for real, with users instead of held-out data. Rossetti, Stella, and Zanker used a within-users design, the same people in both settings, to compare how algorithms ranked by offline accuracy against how they ranked in an online, user-centric study ",[320,643,644],{},[323,645,333],{"href":332},". The two rankings contradicted each other. The algorithm that looked best on the offline metric was significantly worse online at producing recommendations users actually found useful, meaning both relevant and novel. The authors put it plainly, the external validity of the most common offline evaluation method is not guaranteed.",[316,648,649],{},"That is roughly a decade of green dashboards over unhappy users, documented in a field that depends on getting recommendations right. The practical reading is modest and useful. 
An offline eval is a screen, a cheap way to decide which candidates deserve a more expensive test. It is not the verdict. Treating the offline ranking as the answer is how a system clears every check and still disappoints the people it was built for.",[299,651,653],{"id":652},"the-proxy-you-reach-for-online-bites-back","The Proxy You Reach For Online Bites Back",[316,655,656,657,661],{},"The obvious response is to stop measuring proxies and measure users. The catch is that online measurement is also a proxy. What gets logged is behavior, clicks, session length, a thumbs up, whether someone came back, and behavior is not the same as value. Kleinberg, Mullainathan, and Raghavan modeled what happens when a well-meaning platform optimizes engagement while users hold inconsistent preferences, wanting one thing in the moment and another on reflection ",[320,658,659],{},[323,660,372],{"href":371},". Engagement and genuine utility can pull apart. Their image for it is chips and salad. For some content, the fact that people consume more of it tracks real value, the way the most-watched calculus tutorial may simply be the best one. For other content, consuming more signals a pull people would not endorse on reflection, the way a bag of chips disappears. Users can spend long sessions and get little from them, and a change can lift engagement for a while before they quit abruptly.",[316,663,664,665,669],{},"Time makes this harder to read. Drawing from online experiments, Kohavi and colleagues catalog why short experiments mislead, including novelty and primacy effects, where a change looks like a win for the first couple of weeks mainly because it is new, then fades once the novelty wears off ",[320,666,667],{},[323,668,388],{"href":387},". Their central advice is to choose an Overall Evaluation Criterion, a metric deliberately tied to long-term value rather than a short-term bump, and to run experiments long enough to see past the early effect. 
A green two-week A\u002FB test and a satisfied user a quarter later are different claims, and only one of them is on the dashboard.",[299,671,673],{"id":672},"some-of-works-was-never-going-into-a-test","Some of \"Works\" Was Never Going Into a Test",[287,675,308,676],{"style":307},[310,677],{"src":678,"alt":679,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1555992457-720eb4e75880?w=1200&auto=format&fit=crop","An empty retro diner lit by neon, analogous to silent churn, where the empty booths record that customers left but never record why, so the remaining feedback comes only from the people who stayed",[316,681,682,683,687],{},"Underneath all of this sits a limit that no amount of better instrumentation removes. Ackerman, in a foundational paper on computer-supported work, named the socio-technical gap, the divide between what a system needs to support socially and what it can actually be built to support technically, because human activity is flexible, nuanced, and context-dependent while technical mechanisms stay rigid ",[320,684,685],{},[323,686,358],{"href":357},". For an AI product, part of whether it works for a particular person lives in context an eval cannot enumerate, the user's unstated goal, how much weight they place on a confidently wrong answer, the stakes of the moment, the social setting the output lands in. Some of that is not in the test set, and some of it could not be put there.",[316,689,690],{},"An empty diner sharpens the point. The bare booths record that people left, not why they left. Dissatisfied users mostly do not file tickets, they simply stop coming back, so the feedback that would correct the dashboard is filtered by who chose to stay. 
The dashboard hears from the survivors, exactly the population least likely to report the problem that drove everyone else away, and infers contentment from their presence.",[299,692,694],{"id":693},"measuring-the-team-and-the-experience","Measuring the Team and the Experience",[316,696,697,698,702,703,707,708,712,716,717,334],{},"None of this argues against evals, and none of it asks for a single new number. It argues for widening what the dashboard is allowed to count as evidence. Scoring the human-AI team rather than the model alone, by tracking whether the pair outperforms either side solo and whether reliance on the model is appropriate rather than automatic, brings the duet back into view ",[320,699,700],{},[323,701,326],{"href":325},". Treating an offline eval as a screen that earns a candidate the right to an online test, and letting the online result rather than the offline ranking make the call, respects what recommender systems learned the hard way ",[320,704,705],{},[323,706,333],{"href":332},". Choosing a long-term evaluation criterion, watching for behavioral proxies that drift away from it, and running experiments past the novelty window keep the online dashboard from telling a flattering short story ",[320,709,710],{},[323,711,372],{"href":371},[320,713,714],{},[323,715,388],{"href":387},". And instrumenting the interaction itself, recovery after a bad turn, escalation rate, repeat use, numbers broken out by user segment rather than averaged into one cheerful figure, narrows the part of the gap that can be narrowed, while a human channel stays open for the part that cannot ",[320,718,719],{},[323,720,358],{"href":357},[299,722,724],{"id":723},"what-the-green-actually-certifies","What the Green Actually Certifies",[316,726,727],{},"A green offline dashboard certifies something narrow, and stating it plainly is half the cure. 
The model did well on a fixed set of inputs, by itself, at one moment, measured by a quantity that may or may not be the one users feel. It does not certify that the human-AI team did well, that the result holds up against live traffic, that the proxy being optimized is the thing users actually value, or that the unmeasurable remainder is being handled at all. The distance between scoring a model and serving a person is where the unhappy users are, and that distance does not show up on a dashboard built to watch the model. Closing it starts with measuring the second thing, not assuming the first one stands in for it.",[287,729,291,731,291,733],{"className":730},[454,455],[299,732,458],{"id":454},[460,734,308,736,308,747,308,758,308,769,308,780,291],{"className":735},[463,464,465,466],[468,737,738,739,742,743],{"id":470},"G. Bansal et al., \"Does the Whole Exceed its Parts? The Effect of AI Explanations on Complementary Team Performance,\" in ",[473,740,741],{},"Proc. 2021 CHI Conference on Human Factors in Computing Systems (CHI '21)",", 2021. DOI: ",[323,744,484],{"href":745,"target":480,"className":746},"https:\u002F\u002Fdoi.org\u002F10.1145\u002F3411764.3445717",[482,483],[468,748,749,750,753,754],{"id":487},"M. Rossetti, F. Stella, and M. Zanker, \"Contrasting Offline and Online Results when Evaluating Recommendation Algorithms,\" in ",[473,751,752],{},"Proc. 10th ACM Conference on Recommender Systems (RecSys '16)",", 2016, pp. 31–34. DOI: ",[323,755,484],{"href":756,"target":480,"className":757},"https:\u002F\u002Fdoi.org\u002F10.1145\u002F2959100.2959176",[482,483],[468,759,760,761,764,765],{"id":499},"J. Kleinberg, S. Mullainathan, and M. Raghavan, \"The Challenge of Understanding What Users Want: Inconsistent Preferences and Engagement Optimization,\" ",[473,762,763],{},"Management Science",", vol. 70, no. 9, pp. 6336–6355, 2024. 
DOI: ",[323,766,484],{"href":767,"target":480,"className":768},"https:\u002F\u002Fdoi.org\u002F10.1287\u002Fmnsc.2022.03683",[482,483],[468,770,771,772,775,776],{"id":510},"R. Kohavi et al., \"Trustworthy Online Controlled Experiments: Five Puzzling Outcomes Explained,\" in ",[473,773,774],{},"Proc. 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '12)",", 2012, pp. 786–794. DOI: ",[323,777,484],{"href":778,"target":480,"className":779},"https:\u002F\u002Fdoi.org\u002F10.1145\u002F2339530.2339653",[482,483],[468,781,782,783,786,787],{"id":520},"M. S. Ackerman, \"The Intellectual Challenge of CSCW: The Gap Between Social Requirements and Technical Feasibility,\" ",[473,784,785],{},"Human-Computer Interaction",", vol. 15, no. 2-3, pp. 179–203, 2000. DOI: ",[323,788,484],{"href":789,"target":480,"className":790},"https:\u002F\u002Fdoi.org\u002F10.1207\u002FS15327051HCI1523_5",[482,483],{"title":551,"searchDepth":552,"depth":552,"links":792},[793,794,795,796,797,798,799,800],{"id":594,"depth":552,"text":595},{"id":616,"depth":552,"text":617},{"id":637,"depth":552,"text":638},{"id":652,"depth":552,"text":653},{"id":672,"depth":552,"text":673},{"id":693,"depth":552,"text":694},{"id":723,"depth":552,"text":724},{"id":454,"depth":552,"text":458},"2026-06-13","Suppose the eval is clean, stable, and honestly scored. A green dashboard can still sit above unhappy users, because it measures the model alone, on fixed inputs, at one moment, while the user lives with the whole system, in back-and-forth, over weeks. 
Three fields that have lived this gap have the evidence for it.",{"src":600},{"authors":805,"badge":808,"source":809},[806],{"avatar":807,"name":570,"to":571},{"src":569},{"label":573},{"name":575,"url":571},{"title":170,"description":802},"jU0wYYIVBiy-h3OtKktxcE5wMAD-MmLUJv5ZS3Qbqis",{"id":813,"title":242,"body":814,"date":1070,"description":1071,"extension":563,"image":1072,"meta":1073,"navigation":576,"path":243,"seo":1080,"stem":244,"__hash__":1081,"_path":243},"insights\u002Fnews\u002Finsights\u002Fspeculative-execution-pattern.md",{"type":284,"value":815,"toc":1061},[816,828,834,837,840,844,856,864,877,881,884,887,893,901,909,913,921,924,928,935,952,958,961,963,966,969],[287,817,291,819,291,823],{"className":818},[290],[293,820,242],{"className":821,"id":822},[296],"guess-first-check-later",[299,824,827],{"className":825,"id":826},[302],"speculative-execution-as-an-architectural-pattern-across-serving-reasoning-and-agents","Speculative Execution as an Architectural Pattern Across Serving, Reasoning, and Agents",[287,829,308,830],{"style":307},[310,831],{"src":832,"alt":833,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1518770660439-4636190af475?w=1200&auto=format&fit=crop","Close-up of a circuit board with intricate interconnected pathways, analogous to the processor technique of speculative execution that language model serving has borrowed",[316,835,836],{},"A processor that reaches a branch in a program does not wait to learn which way the branch goes. It predicts the likely path, runs ahead on that guess, and keeps the work if the guess was right or discards it if the guess was wrong. The technique is called speculative execution, and it has been part of computer architecture for decades. It pays off because speculating is cheap relative to waiting, and checking a guess is cheap relative to producing the answer from scratch. 
That asymmetry in cost is the whole reason the trick is worth the bookkeeping.",[316,838,839],{},"Large language models turn out to have the same asymmetry in several places, which may be why the same idea keeps getting rediscovered under different names. A pattern worth naming is one that shows up in more than one corner of a system, and the draft-then-verify shape now appears in token generation, code production, retrieval, and agent planning. Treating these as one pattern, rather than four unrelated tricks, suggests where the shared design problems and shared mistakes are likely to live.",[299,841,843],{"id":842},"one-pattern-several-names","One Pattern, Several Names",[316,845,846,847,851,852,334],{},"The clearest instance is speculative decoding, introduced for transformer inference in 2022. The observation behind it is that generating text one token at a time is slow not because the arithmetic is heavy but because the hardware spends most of its time moving the model's parameters from memory for each single token. A small, fast draft model proposes several tokens. The large target model then checks all of those proposed tokens in a single pass, which costs about the same as producing one token on its own, and a sampling rule accepts the longest prefix that matches what the large model would have produced anyway. The work demonstrated a two to three times speedup on a large model with identical outputs, no retraining, and no architecture change ",[320,848,849],{},[323,850,326],{"href":325},". A parallel effort at DeepMind arrived at the same core method independently, reporting roughly a two to two and a half times speedup on a 70 billion parameter model while preserving the target model's output distribution exactly through a modified rejection sampling scheme ",[320,853,854],{},[323,855,333],{"href":332},[316,857,858,859,863],{},"A 2024 survey of the area makes the lineage explicit. 
It describes speculative decoding as an adaptation of speculative execution from computer architecture, the same optimization where tasks are performed in advance and then verified for whether they were needed ",[320,860,861],{},[323,862,372],{"href":371},". The survey also names the two design questions that govern whether the pattern helps. The first is how to build a drafter that balances speculation accuracy against drafting cost. The second is whether the verification step can stay parallel while still guaranteeing output quality. Both questions reappear, in slightly different clothing, every other place the pattern shows up.",[316,865,866,867,871,872,876],{},"Two later variants are worth noting because they show how much room the drafter side has. Medusa drops the separate draft model entirely and adds small extra prediction heads to the existing model, which propose several future tokens that a tree-based attention step verifies together, reporting roughly two to three and a half times speedup without a second model to maintain ",[320,868,869],{},[323,870,388],{"href":387},". EAGLE moves the drafting down to the model's internal feature representations rather than its output tokens and reports a 2.7 to 3.5 times latency improvement while keeping the generated distribution unchanged ",[320,873,874],{},[323,875,358],{"href":357},". The verification half stays constant across these variants. What changes is how cheaply and accurately the guess gets made.",[299,878,880],{"id":879},"the-same-shape-outside-token-generation","The Same Shape Outside Token Generation",[316,882,883],{},"The reason to treat this as a pattern rather than an inference trick is that the draft-then-verify structure is not specific to tokens. It appears wherever cheap generation under uncertainty can be paired with a more trustworthy and relatively cheap check.",[316,885,886],{},"Code generation is the most familiar case. 
A model proposes an implementation, and a deterministic tool decides whether the proposal is acceptable. The tool might be a compiler, a type checker, or a test suite. The generator does not need to be right on the first attempt. It needs to be right often enough that the combined cost of generating and checking beats the cost of a slow, careful, single pass. The verifier here has a quality that the token-level case has to work hard to approximate, which is that a compiler or a passing test is an external and largely objective judgment rather than another opinion from the same family of model.",[287,888,308,889],{"style":307},[310,890],{"src":891,"alt":892,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1769142919507-8ec02ea9711c?w=1200&auto=format&fit=crop","A metal ruler laid across printed text on a page, analogous to a verifier checking a cheaply produced draft against a fixed and external standard",[316,894,895,896,900],{},"A ruler laid across a printed page does not write the text. It measures it against a fixed standard, and that division of labor is the heart of the pattern. Retrieval-augmented generation follows the same division. A fast similarity search over a vector index guesses which documents are likely to be relevant, and a reader model checks those candidates and uses the useful ones. The original retrieval-augmented generation work combined a pretrained generator with a dense vector index of Wikipedia accessed by a neural retriever, and found the combination produced more specific and more factual output than the generator alone ",[320,897,898],{},[323,899,412],{"href":411},". The retriever is speculating about relevance. The reader corrects that speculation. The asymmetry holds, since the lookup is fast and the reader's pass over a handful of candidates is far cheaper than reasoning without any retrieval at all.",[316,902,903,904,908],{},"Reasoning shows the pattern too, and it is where the cost balance gets most interesting. 
An early and influential result on grade-school math problems trained a separate verifier to judge the correctness of candidate solutions, generated many candidates at test time, and selected the one the verifier ranked highest. Verification improved accuracy and scaled better with more data than simply fine-tuning the generator harder ",[320,905,906],{},[323,907,398],{"href":397},". Generating several cheap candidate solutions and spending the expensive judgment on selection is the same move as drafting several cheap tokens and spending the expensive forward pass on acceptance.",[299,910,912],{"id":911},"what-makes-the-pattern-pay-off","What Makes the Pattern Pay Off",[316,914,915,916,920],{},"Every instance of draft-then-verify lives or dies on one number, which is how often the verifier accepts the draft. If the drafter is poorly matched to the verifier, the verifier rejects nearly everything, and the system pays for two models while getting the output of one. The speculative decoding survey frames this as the central tension of drafter design, the trade between how accurate the speculation is and how cheap it is to produce ",[320,917,918],{},[323,919,372],{"href":371},". A better drafter raises the acceptance rate but costs more to run, which narrows the very advantage the pattern exists to capture. There is an operating point that depends on the acceptance rate, the cost of the verifier, and the cost ratio between drafter and verifier, and from a systems perspective it is striking how often that operating point is chosen by intuition rather than measured.",[316,922,923],{},"The useful consequence of seeing these cases as one pattern is that the calibration lessons transfer. Acceptance rate in speculative decoding, compilation pass rate in code generation, and verifier selection rate in reasoning are the same quantity wearing different labels. 
A team that has learned how sensitive token-level speedup is to draft-target alignment already knows something about why a code agent that drafts with one model and verifies with mismatched tests will stall. The drafter and the checker have to agree often enough, on the right things, for the arrangement to be worth its overhead.",[299,925,927],{"id":926},"the-verifier-is-the-weak-point","The Verifier Is the Weak Point",[316,929,930,931,334],{},"The pattern is only as trustworthy as its verifier, and verifiers are not all equally trustworthy. A deterministic check is the strongest kind. A compiler, a type system, a test suite, or a rejection sampling rule that provably preserves a distribution gives a hard signal that is external to the model doing the guessing. The speculative decoding results are reassuring precisely because their verification step is a mathematical guarantee about the output distribution rather than a judgment call ",[320,932,933],{},[323,934,333],{"href":332},[316,936,937,938,944,945,951],{},"The trouble starts when the verifier is itself a language model. Using a strong model as a judge can approximate human preference well, reaching over 80 percent agreement with human raters in one widely cited study, but the same work documents the failure modes that come with it, including position bias, verbosity bias, and a self-enhancement bias where a model tends to favor outputs that resemble its own ",[320,939,940],{},[323,941,943],{"href":942},"#source-8","[8]",". A verifier that prefers answers shaped like its own guesses is a weak check on a drafter from the same model family, since the two share blind spots. The risk compounds when the verifier and the drafter are the same model asked to grade itself. Research on reasoning found that models often fail to correct their own answers without external feedback, and that performance sometimes degrades after a self-correction pass ",[320,946,947],{},[323,948,950],{"href":949},"#source-9","[9]",". 
The draft-then-verify pattern inherits that finding directly. If the verification step is just the generator in a more skeptical voice, it may not be catching much.",[287,953,308,954],{"style":307},[310,955],{"src":956,"alt":957,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1635859890085-ec8cb5466806?w=1200&auto=format&fit=crop","A person reviewing and signing layered documents, analogous to the verifier acting as the gate that decides whether a cheaply produced draft is accepted",[316,959,960],{},"A signature on a reviewed document is a gate, and the value of the gate depends entirely on whether the reviewer can actually see the errors. The same is true for any system built on this pattern. The choice of verifier may be the most consequential design decision in the whole arrangement, more consequential than the choice of drafter, because a fast drafter paired with a weak verifier produces fast output that no one should trust, while a modest drafter paired with a hard external check produces output a team can stand behind.",[299,962,447],{"id":446},[316,964,965],{},"What to do when a draft is rejected is a third decision, and it is usually made implicitly. The options include discarding and regenerating, regenerating with the rejection as feedback, falling back to the expensive path for that one request, or escalating to a person. Each carries a different cost and quality profile, and each is the kind of routing decision worth logging and tuning rather than hardcoding, in the same way other branch points in a model serving stack get instrumented.",[316,967,968],{},"The honest summary is that many teams are already running this pattern in more than one place without recognizing it as one pattern. A serving team tunes speculative decoding acceptance rates. A coding-agent team tunes how often generated code passes its tests. A retrieval team tunes how many candidates the reader has to sift. 
These are the same problem, which means the calibration tooling, the verifier-quality cautions, and the rejection-handling policies could be shared rather than rebuilt three times. The pattern is simple to state, which is to guess cheaply, check with something more trustworthy, and keep the work only if it survives the check. The engineering judgment lives almost entirely in how cheap the guess really is and how much the check can actually be trusted.",[287,970,291,972,291,974],{"className":971},[454,455],[299,973,458],{"id":454},[460,975,308,977,308,987,308,996,308,1006,308,1015,308,1024,308,1035,308,1045,308,1053,291],{"className":976},[463,464,465,466],[468,978,979,980,476,983],{"id":470},"Y. Leviathan, M. Kalman, and Y. Matias, \"Fast Inference from Transformers via Speculative Decoding,\" in ",[473,981,982],{},"Proc. International Conference on Machine Learning (ICML)",[323,984,484],{"href":985,"target":480,"className":986},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2211.17192",[482,483],[468,988,989,990,476,992],{"id":487},"C. Chen, S. Borgeaud, G. Irving, J.-B. Lespiau, L. Sifre, and J. Jumper, \"Accelerating Large Language Model Decoding with Speculative Sampling,\" ",[473,991,491],{},[323,993,484],{"href":994,"target":480,"className":995},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2302.01318",[482,483],[468,997,998,999,492,1002],{"id":499},"H. Xia et al., \"Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding,\" in ",[473,1000,1001],{},"Findings of the Association for Computational Linguistics (ACL)",[323,1003,484],{"href":1004,"target":480,"className":1005},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.07851",[482,483],[468,1007,1008,1009,492,1011],{"id":510},"T. 
Cai et al., \"Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads,\" ",[473,1010,491],{},[323,1012,484],{"href":1013,"target":480,"className":1014},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.10774",[482,483],[468,1016,1017,1018,492,1020],{"id":520},"Y. Li, F. Wei, C. Zhang, and H. Zhang, \"EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty,\" ",[473,1019,491],{},[323,1021,484],{"href":1022,"target":480,"className":1023},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2401.15077",[482,483],[468,1025,1026,1027,1030,1031],{"id":532},"P. Lewis et al., \"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks,\" in ",[473,1028,1029],{},"Proc. 34th Int. Conf. Neural Inf. Process. Syst. (NeurIPS)",", 2020. arXiv preprint DOI: ",[323,1032,484],{"href":1033,"target":480,"className":1034},"https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2005.11401",[482,483],[468,1036,1037,1038,1040,1041],{"id":542},"K. Cobbe et al., \"Training Verifiers to Solve Math Word Problems,\" ",[473,1039,491],{},", 2021, ",[323,1042,484],{"href":1043,"target":480,"className":1044},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2110.14168",[482,483],[468,1046,471,1048,476,1050],{"id":1047},"source-8",[473,1049,475],{},[323,1051,484],{"href":479,"target":480,"className":1052},[482,483],[468,1054,543,1056,492,1058],{"id":1055},"source-9",[473,1057,524],{},[323,1059,484],{"href":548,"target":480,"className":1060},[482,483],{"title":551,"searchDepth":552,"depth":552,"links":1062},[1063,1064,1065,1066,1067,1068,1069],{"id":826,"depth":552,"text":827},{"id":842,"depth":552,"text":843},{"id":879,"depth":552,"text":880},{"id":911,"depth":552,"text":912},{"id":926,"depth":552,"text":927},{"id":446,"depth":552,"text":447},{"id":454,"depth":552,"text":458},"2026-05-29","Speculative decoding made large language models faster by drafting cheaply and verifying expensively. 
The same draft-verify shape now shows up in code generation, retrieval, and agent planning, which raises the question of whether teams are solving the same design problem several times without noticing it is one pattern.",{"src":832},{"authors":1074,"badge":1077,"source":1079},[1075],{"avatar":1076,"name":570,"to":571},{"src":569},{"label":1078},"AI Architecture",{"name":575,"url":571},{"title":242,"description":1071},"tUS0AFcpRf3cNqA4cbQZeSlD3K3PtYAFkmkhj6uurPE",1782047594408]