[{"data":1,"prerenderedAt":1332},["ShallowReactive",2],{"navigation":3,"\u002Fnews\u002Finsights\u002Fcontext-is-infrastructure":261,"\u002Fnews\u002Finsights\u002Fcontext-is-infrastructure-surround":623},[4,8,17,21,25,29,33,37,249,253,257],{"title":5,"path":6,"stem":7},"About Thinkata Intelligence","\u002Fabout","about",{"title":9,"path":10,"stem":11,"children":12},"Authentication","\u002Fauth","auth",[13],{"title":14,"path":15,"stem":16},"Email Confirmation","\u002Fauth\u002Fconfirmation","auth\u002Fconfirmation",{"title":18,"path":19,"stem":20},"Case Studies","\u002Fcase-studies","case-studies",{"title":22,"path":23,"stem":24},"Contact Us","\u002Fcontact","contact",{"title":26,"path":27,"stem":28},"Thinkata - Advanced AI Engineering & Multi-Agent System Solutions","\u002F","index",{"title":30,"path":31,"stem":32},"Insights","\u002Finsights","insights",{"title":34,"path":35,"stem":36},"Leadership","\u002Fleadership","leadership",{"title":38,"path":39,"stem":40,"children":41},"News","\u002Fnews","news",[42,45,69],{"title":43,"path":39,"stem":44},"News & Insights","news\u002Findex",{"title":18,"path":46,"stem":47,"children":48},"\u002Fnews\u002Fcase-studies","news\u002Fcase-studies",[49,53,57,61,65],{"title":50,"path":51,"stem":52},"Building Secure and Scalable AI Infrastructure: Integrating with Existing Systems through Modern Cloud Frameworks","\u002Fnews\u002Fcase-studies\u002Fcloud-infrastructure-ai","news\u002Fcase-studies\u002Fcloud-infrastructure-ai",{"title":54,"path":55,"stem":56},"Making Sense of Financial Regulations: How AI Teams Can Tackle Complex Documents","\u002Fnews\u002Fcase-studies\u002Ffinancial-regulations","news\u002Fcase-studies\u002Ffinancial-regulations",{"title":58,"path":59,"stem":60},"AI-Powered Transformations in Healthcare","\u002Fnews\u002Fcase-studies\u002Fhealth-care","news\u002Fcase-studies\u002Fhealth-care",{"title":62,"path":63,"stem":64},"Generative AI in Upstream Natural Gas: Shell's Exploration 
Initiative","\u002Fnews\u002Fcase-studies\u002Foil-gas","news\u002Fcase-studies\u002Foil-gas",{"title":66,"path":67,"stem":68},"Optimizing Manufacturing with AI-Driven Multi-Agent Systems","\u002Fnews\u002Fcase-studies\u002Fsupply-chain-optimization","news\u002Fcase-studies\u002Fsupply-chain-optimization",{"title":30,"path":70,"stem":71,"children":72},"\u002Fnews\u002Finsights","news\u002Finsights",[73,77,81,85,89,93,97,101,105,109,113,117,121,125,129,133,137,141,145,149,153,157,161,165,169,173,177,181,185,189,193,197,201,205,209,213,217,221,225,229,233,237,241,245],{"title":74,"path":75,"stem":76},"The Capability-Reliability Split in Agent Systems","\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split","news\u002Finsights\u002Fagent-capability-reliability-split",{"title":78,"path":79,"stem":80},"The Rise of AI Agents in Cyberattacks: Latest Research and Threats","\u002Fnews\u002Finsights\u002Fai-agent-cyber-threats","news\u002Finsights\u002Fai-agent-cyber-threats",{"title":82,"path":83,"stem":84},"The Smart Enterprise AI Stack: Why Teams of AI Agents Beat Solo Models Consistently","\u002Fnews\u002Finsights\u002Fai-architecture","news\u002Finsights\u002Fai-architecture",{"title":86,"path":87,"stem":88},"When Seeing Everything Becomes the Only Option","\u002Fnews\u002Finsights\u002Fai-comprehensive-observability","news\u002Finsights\u002Fai-comprehensive-observability",{"title":90,"path":91,"stem":92},"The Data Infrastructure AI-Native Systems Can't Ignore","\u002Fnews\u002Finsights\u002Fai-data-layer","news\u002Finsights\u002Fai-data-layer",{"title":94,"path":95,"stem":96},"Enterprise AI Triage Systems: Intelligent Automation for Large-Scale Operations","\u002Fnews\u002Finsights\u002Fai-enterprise-triage","news\u002Finsights\u002Fai-enterprise-triage",{"title":98,"path":99,"stem":100},"When Oversight Becomes 
Infrastructure","\u002Fnews\u002Finsights\u002Fai-governed-autonomy","news\u002Finsights\u002Fai-governed-autonomy",{"title":102,"path":103,"stem":104},"Designing for Graceful Failure in Compound AI Systems","\u002Fnews\u002Finsights\u002Fai-graceful-failure","news\u002Finsights\u002Fai-graceful-failure",{"title":106,"path":107,"stem":108},"Intelligent Composability: Building AI Systems Like Orchestra, Not Soloists","\u002Fnews\u002Finsights\u002Fai-intelligent-composability","news\u002Finsights\u002Fai-intelligent-composability",{"title":110,"path":111,"stem":112},"Building the Plane While Flying It — Migrating from Monolith to AI-Native Without Stopping","\u002Fnews\u002Finsights\u002Fai-migration-path","news\u002Finsights\u002Fai-migration-path",{"title":114,"path":115,"stem":116},"Stability Through Continuous Adaptation","\u002Fnews\u002Finsights\u002Fai-native-overview","news\u002Finsights\u002Fai-native-overview",{"title":118,"path":119,"stem":120},"Provable Stability: Mathematical Guarantees for Adaptive AI Systems","\u002Fnews\u002Finsights\u002Fai-provable-stability","news\u002Finsights\u002Fai-provable-stability",{"title":122,"path":123,"stem":124},"How Temperature Tuning Makes or Breaks Reinforcement Learning","\u002Fnews\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse","news\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse",{"title":126,"path":127,"stem":128},"Testing What Can't Be Predicted","\u002Fnews\u002Finsights\u002Fai-systems-testing","news\u002Finsights\u002Fai-systems-testing",{"title":130,"path":131,"stem":132},"Closing the Loop: How Human Corrections Can Make AI Systems Smarter Over Time","\u002Fnews\u002Finsights\u002Fclosing-the-loop","news\u002Finsights\u002Fclosing-the-loop",{"title":134,"path":135,"stem":136},"Multi-Path Reasoning: Collaborative and Competitive Approaches in 
AI","\u002Fnews\u002Finsights\u002Fcollaborative-competitive-agents","news\u002Finsights\u002Fcollaborative-competitive-agents",{"title":138,"path":139,"stem":140},"Why Challenges Supercharge Smarts for Humans and AI","\u002Fnews\u002Finsights\u002Fcompetition-improves-ai","news\u002Finsights\u002Fcompetition-improves-ai",{"title":142,"path":143,"stem":144},"Context is Infrastructure, Not Instructions","\u002Fnews\u002Finsights\u002Fcontext-is-infrastructure","news\u002Finsights\u002Fcontext-is-infrastructure",{"title":146,"path":147,"stem":148},"Context is the New Code","\u002Fnews\u002Finsights\u002Fcontext-is-new-code","news\u002Finsights\u002Fcontext-is-new-code",{"title":150,"path":151,"stem":152},"Continuous Thought Machines","\u002Fnews\u002Finsights\u002Fcontinuous-thought-machines","news\u002Finsights\u002Fcontinuous-thought-machines",{"title":154,"path":155,"stem":156},"Don't Vibe, Architect","\u002Fnews\u002Finsights\u002Fdont-vibe-architect","news\u002Finsights\u002Fdont-vibe-architect",{"title":158,"path":159,"stem":160},"The Edge of the Underdefined","\u002Fnews\u002Finsights\u002Fedge-of-the-underdefined","news\u002Finsights\u002Fedge-of-the-underdefined",{"title":162,"path":163,"stem":164},"A Multi-Tier Safety Architecture for Critical Applications","\u002Fnews\u002Finsights\u002Ffour-tier-architecture","news\u002Finsights\u002Ffour-tier-architecture",{"title":166,"path":167,"stem":168},"Hybrid Autoregressive Residual Tokens","\u002Fnews\u002Finsights\u002Fhart-model","news\u002Finsights\u002Fhart-model",{"title":170,"path":171,"stem":172},"Hierarchical Reasoning in Artificial Intelligence","\u002Fnews\u002Finsights\u002Fhierarchical-approaches","news\u002Finsights\u002Fhierarchical-approaches",{"title":174,"path":175,"stem":176},"Latent Diffusion for Language Generation: A Comprehensive 
Overview","\u002Fnews\u002Finsights\u002Flatent-diffusion-for-language","news\u002Finsights\u002Flatent-diffusion-for-language",{"title":178,"path":179,"stem":180},"Breaking Language Barriers: How AI Can Translate Without Examples","\u002Fnews\u002Finsights\u002Flearning-languages","news\u002Finsights\u002Flearning-languages",{"title":182,"path":183,"stem":184},"The Emergence of AI Deception: How Large Language Models Have Learned to Strategically Mislead Users","\u002Fnews\u002Finsights\u002Fllm-deception","news\u002Finsights\u002Fllm-deception",{"title":186,"path":187,"stem":188},"Synergizing Specialized Reasoning and General Capabilities in AI","\u002Fnews\u002Finsights\u002Fllm-reasoning-advances","news\u002Finsights\u002Fllm-reasoning-advances",{"title":190,"path":191,"stem":192},"The AI That Rewrites Itself: MIT's Breakthrough in Self-Adapting Language Models","\u002Fnews\u002Finsights\u002Fllm-seal","news\u002Finsights\u002Fllm-seal",{"title":194,"path":195,"stem":196},"Metacognitive Reinforcement Learning for Self-Improving AI Systems","\u002Fnews\u002Finsights\u002Fmetacognitive-reinforcement-learning","news\u002Finsights\u002Fmetacognitive-reinforcement-learning",{"title":198,"path":199,"stem":200},"Revolutionary Advancements in Mixture of Experts (MoE) Architectures","\u002Fnews\u002Finsights\u002Fmixture-of-experts","news\u002Finsights\u002Fmixture-of-experts",{"title":202,"path":203,"stem":204},"Balancing Neural Plasticity and Stability","\u002Fnews\u002Finsights\u002Fneural-plasticity","news\u002Finsights\u002Fneural-plasticity",{"title":206,"path":207,"stem":208},"Offline RL and the Data Flywheel","\u002Fnews\u002Finsights\u002Foffline-rl-data-flywheel","news\u002Finsights\u002Foffline-rl-data-flywheel",{"title":210,"path":211,"stem":212},"When Optimization Optimizes Itself","\u002Fnews\u002Finsights\u002Frecursive-goodhart","news\u002Finsights\u002Frecursive-goodhart",{"title":214,"path":215,"stem":216},"Reward Design as 
Architecture","\u002Fnews\u002Finsights\u002Freward-design-as-architecture","news\u002Finsights\u002Freward-design-as-architecture",{"title":218,"path":219,"stem":220},"When Success Has No Author: The Temporal Credit Assignment Problem","\u002Fnews\u002Finsights\u002Frl-credit-assignment-problem","news\u002Finsights\u002Frl-credit-assignment-problem",{"title":222,"path":223,"stem":224},"Beyond Entropy Collapse: When Exploration Succeeds but Learning Fails","\u002Fnews\u002Finsights\u002Frl-optimization-gaps","news\u002Finsights\u002Frl-optimization-gaps",{"title":226,"path":227,"stem":228},"The Path to Practical Confidential Computing for AI Systems","\u002Fnews\u002Finsights\u002Fsecure-ai-architectures","news\u002Finsights\u002Fsecure-ai-architectures",{"title":230,"path":231,"stem":232},"Spiking Neural Networks for Energy-Efficient AI","\u002Fnews\u002Finsights\u002Fspiking-neural-networks","news\u002Finsights\u002Fspiking-neural-networks",{"title":234,"path":235,"stem":236},"The Turn as the Unit of Quality","\u002Fnews\u002Finsights\u002Fstructured-iteration-quality","news\u002Finsights\u002Fstructured-iteration-quality",{"title":238,"path":239,"stem":240},"AI Speech Translation: Breaking Down Language Barriers","\u002Fnews\u002Finsights\u002Fsts-performance-advances","news\u002Finsights\u002Fsts-performance-advances",{"title":242,"path":243,"stem":244},"Test-Time Training Layers: The Next Evolution in Transformer Architecture","\u002Fnews\u002Finsights\u002Ftest-time-training-layers","news\u002Finsights\u002Ftest-time-training-layers",{"title":246,"path":247,"stem":248},"Breakthrough: Large Language Models Pass the Turing Test","\u002Fnews\u002Finsights\u002Fturing-tests","news\u002Finsights\u002Fturing-tests",{"title":250,"path":251,"stem":252},"Privacy Policy","\u002Fprivacy","privacy",{"title":254,"path":255,"stem":256},"Research","\u002Fresearch","research",{"title":258,"path":259,"stem":260},"Terms of 
Service","\u002Fterms","terms",{"id":262,"title":142,"body":263,"date":603,"description":604,"extension":605,"image":606,"meta":608,"navigation":620,"path":143,"seo":621,"stem":144,"__hash__":622},"insights\u002Fnews\u002Finsights\u002Fcontext-is-infrastructure.md",{"type":264,"value":265,"toc":593},"minimark",[266,285,298,301,311,315,325,333,343,356,360,377,398,413,428,432,442,450,476,479,483,486,492,495,498],[267,268,271,272,271,278],"div",{"className":269},[270],"page-title","\n  ",[273,274,142],"h1",{"className":275,"id":277},[276],"page-title__main","context-is-infrastructure-not-instructions",[279,280,284],"h2",{"className":281,"id":283},[282],"page-title__sub","what-teams-gain-when-they-govern-ai-context-like-a-software-dependency","What teams gain when they govern AI context like a software dependency",[286,287,288,289,297],"p",{},"A team replaces task-specific prompts with a generic \"improved\" template. Extraction accuracy drops from 100% to 90%. RAG compliance (the degree to which a model's answers stay grounded in retrieved documents rather than generating from its own training data) falls from 93.3% to 80% ",[290,291,292],"sup",{},[293,294,296],"a",{"href":295},"#source-1","[1]",". The model is the same. The new instructions look better on paper. What changed was the context, and nobody tested whether the change was safe before deploying it.",[286,299,300],{},"This is context regression, a term borrowed from software engineering where \"regression\" means a change that was supposed to improve something but degraded existing behavior instead. 
It behaves like any other dependency compatibility problem in a software supply chain, and the governance response, production contracts, risk-based test suites, compatibility gates, is the same one software teams already use for their other dependencies.",[286,302,303,306,307,310],{},[293,304,305],{"href":147},"\"Context is the New Code\""," established context engineering as a formal discipline with its own taxonomy, maturity levels, and practitioner artifacts, and ",[293,308,309],{"href":235},"\"The Turn as the Unit of Quality\""," explored how structured iteration with checklists and selective memory improves turn-level quality. This article picks up a different thread. What happens when context moves from a single team's configuration file to an organizational dependency serving dozens of agents across thousands of daily interactions? Recent research suggests that the teams making the fastest progress are the ones applying familiar software supply chain governance to their context, and the returns are measurable.",[279,312,314],{"id":313},"what-structured-context-unlocks","What Structured Context Unlocks",[286,316,317,318,324],{},"A study of 200 documented interactions across four AI tools found that incomplete context was associated with 72% of iteration cycles ",[290,319,320],{},[293,321,323],{"href":322},"#source-2","[2]",". That number is worth sitting with. Nearly three-quarters of the rework, the back-and-forth where a human corrects, clarifies, and re-prompts, traced not to a bad model or a poorly worded instruction but to missing information that should have been available from the start.",[286,326,327,328,332],{},"When the same study introduced structured context assembly, a methodology that organizes context into five roles (Authority, Exemplar, Constraint, Rubric, and Metadata), iteration cycles dropped from an average of 3.8 to 2.0 per task, and first-pass acceptance rose from 32% to 55% ",[290,329,330],{},[293,331,323],{"href":322},". 
Authority context establishes what standards govern the task. Exemplar context provides reference outputs that demonstrate the expected quality. Constraint context defines boundaries the output must respect. Rubric context specifies how the output will be evaluated. Metadata context supplies facts, dates, names, and domain-specific details. Having names for these roles is not a minor convenience, it is what makes the difference between ad hoc tuning and repeatable engineering, because a team that cannot describe what is missing from its context cannot systematically fix it.",[267,334,336,337],{"style":335},"width: 100%; margin: 20px 0;","\n    ",[338,339],"img",{"src":340,"alt":341,"style":342},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1639066648921-82d4500abf1a?w=800&auto=format&fit=crop","Rows of server equipment in a data center, analogous to how structured context engineering creates organized, reliable infrastructure rather than ad hoc configurations","width: 100%; height: 320px; object-fit: cover; object-position: center;",[286,344,345,346,350,351,355],{},"Like a well-organized server room where every cable run is labeled and every rack follows a standard layout, structured context gives a team the ability to reason about what the AI is actually working with. The evaluation-driven iteration research reinforces this by showing that context quality is not one-dimensional ",[290,347,348],{},[293,349,296],{"href":295},". A change that improves instruction-following can simultaneously degrade extraction accuracy. A prompt that scores better on helpfulness can score worse on format compliance. The minimum viable evaluation suite (MVES) framework proposes tiered evaluation requirements, one set for general applications, another for retrieval-augmented generation systems, and a third for agentic workflows, precisely because quality along one dimension does not guarantee quality along others ",[290,352,353],{},[293,354,296],{"href":295},". 
The practical implication is that quality has multiple dimensions that can trade against each other, and navigating those trade-offs requires measurement infrastructure, not intuition.",[279,357,359],{"id":358},"governing-context-as-a-dependency","Governing Context as a Dependency",[286,361,362,363,369,370,376],{},"The clearest articulation of this shift comes from research that frames LLM update management as a software supply chain governance problem ",[290,364,365],{},[293,366,368],{"href":367},"#source-3","[3]",". Hosted language model services evolve through provider-side updates without explicit version changes, so the API endpoint stays the same while the behavior underneath shifts. Empirical work cited within that framework documents cases where code execution accuracy dropped from 52% to 10% within three months with no version change on the consumer side ",[290,371,372],{},[293,373,375],{"href":374},"#source-7","[7]",". This is behavioral drift (a gradual, unannounced change in how a model responds to the same inputs), and it affects every piece of context that was tuned against the previous behavior.",[286,378,379,380,384,385,389,390,393,394,397],{},"The proposed governance framework has three components that map directly to established software engineering practice ",[290,381,382],{},[293,383,368],{"href":367},". ",[386,387,388],"strong",{},"Production contracts"," define explicit behavioral rules with measurable thresholds, things like \"authentication code must pass security tests\" or \"JSON outputs must be valid.\" ",[386,391,392],{},"Risk-category-based testing"," organizes evaluation around deployment risk areas rather than relying on a single aggregate score, preventing critical regressions in formatting or safety from being masked by overall performance improvements. ",[386,395,396],{},"Compatibility gates"," block updates that fail defined thresholds, requiring review before a model update is adopted into production. 
None of these ideas are new to software engineering. What is new is recognizing that context, the system prompts, retrieved documents, and configuration files that shape AI behavior, is a dependency that deserves the same governance.",[286,399,400,401,407,408,412],{},"A readiness harness for LLM and RAG applications demonstrates what this looks like in practice ",[290,402,403],{},[293,404,406],{"href":405},"#source-4","[4]",". The system combines automated benchmarks, OpenTelemetry observability (a standardized way to collect and export telemetry data like traces, metrics, and logs), and CI quality gates (automated checkpoints in the deployment pipeline that block releases if quality checks fail) under a minimal API contract. Rather than reducing readiness to a single metric, it aggregates workflow success, policy compliance, groundedness, retrieval hit rate, cost, and latency into scenario-weighted readiness scores. In ticket-routing experiments, the regression gates consistently rejected unsafe prompt variants before deployment ",[290,409,410],{},[293,411,406],{"href":405},". This is a concrete example of the shift from \"the model was tested\" to \"the deployment pipeline tested every context change before it reached production.\"",[286,414,415,416,422,423,427],{},"One challenge specific to AI systems is that the same configuration can produce different outputs across runs. Traditional binary pass\u002Ffail testing struggles with this fundamental non-determinism. A regression testing framework designed for this problem replaces binary verdicts with three-valued probabilistic outcomes (Pass, Fail, Inconclusive) backed by confidence intervals and sequential analysis ",[290,417,418],{},[293,419,421],{"href":420},"#source-5","[5]",". 
The framework achieves 78 to 100% cost reduction compared to naive repeated testing while maintaining statistical guarantees, and its behavioral fingerprinting approach achieves 86% detection power on regressions where binary pass\u002Ffail testing has 0% ",[290,424,425],{},[293,426,421],{"href":420},". The cost reduction matters as much as the accuracy. Testing that is too expensive to run routinely is testing that does not get run, and context changes that do not get tested are the ones that cause production surprises.",[279,429,431],{"id":430},"from-files-to-living-systems","From Files to Living Systems",[286,433,434,435,441],{},"The governance patterns above treat context as a versioned artifact, something written, tested, and deployed. But a growing body of work suggests that this framing, while useful, captures only part of the picture. In production multi-agent systems, context is not a file. It is a runtime-constructed \"View\" projected into an agent's context window (the maximum amount of text a model can consider at once) from a pool of global artifacts, and that View changes dynamically based on the task, the step, and the state of the system ",[290,436,437],{},[293,438,440],{"href":439},"#source-6","[6]",".",[286,443,444,445,449],{},"Research on what the authors call \"Loosely-Structured Software\" characterizes this as a class of system whose defining property is runtime generation and evolution under uncertainty ",[290,446,447],{},[293,448,440],{"href":439},". Classic software architecture assumes build-time decomposition and slow-changing boundaries. Multi-agent AI systems violate those assumptions in three ways. First, an agent's effective program is determined not by compiled code but by a View assembled at runtime from system prompts, skills, plans, tools, and memories. Second, the connections between components form dynamically through semantic understanding rather than fixed function signatures. 
Third, the system's own executable substrate, the artifacts that mediate its behavior, can be rewritten by the system itself.",[286,451,452,453,384,457,460,461,463,464,467,468,471,472,475],{},"To make this governable, the research proposes a three-layer engineering framework ",[290,454,455],{},[293,456,440],{"href":439},[386,458,459],{},"View\u002FContext Engineering"," manages the execution environment and maintains task-relevant Views. This is the layer where the static context files that teams already write (the CLAUDE.md and AGENTS.md files examined in ",[293,462,305],{"href":147},") get assembled, filtered, and delivered at runtime. ",[386,465,466],{},"Structure Engineering"," organizes the dynamic bindings between agents and artifacts, governing how components find and connect to each other. ",[386,469,470],{},"Evolution Engineering"," manages the lifecycle of self-rewriting artifacts, ensuring that when the system modifies its own context (a capability that ",[293,473,474],{"href":159},"\"The Edge of the Underdefined\""," documents self-improving agents already demonstrating), those modifications remain within governed bounds.",[286,477,478],{},"This is where context infrastructure becomes genuinely adaptive. Instead of choosing between static configuration files (reliable but rigid) and autonomous self-modification (flexible but ungoverned), the three-layer framework offers a middle path. Context can evolve in response to operational feedback, while infrastructure constraints prevent that evolution from drifting outside acceptable bounds. 
The combination of governance patterns from the supply chain framing with the runtime adaptivity from the loosely-structured software framing points toward a more complete picture of what production context infrastructure might look like.",[279,480,482],{"id":481},"the-maturity-opportunity","The Maturity Opportunity",[286,484,485],{},"The infrastructure patterns described here, production contracts, multi-dimensional evaluation, CI gates, statistical regression testing, runtime View management, each have working implementations backed by empirical evidence. The gap between what the research demonstrates and what most teams have actually built is mostly one of adoption, not of available tools.",[286,487,488,489,491],{},"Survey data suggests that prompt usage in software engineering remains largely ad hoc, with prompts refined through trial-and-error and rarely reused. As ",[293,490,305],{"href":147}," noted, only about 5% of surveyed open-source repositories have adopted any context file format at all. The parallel to early unit testing adoption or early version control adoption is hard to miss. A practice that starts as optional among a skilled minority tends to become standard once enough teams experience the cost of not doing it.",[286,493,494],{},"What distinguishes this moment is that the infrastructure does not need to be invented from scratch. Supply chain governance, production testing methodology, continuous deployment practice, and statistical experiment design all have established patterns that transfer directly to context management. Treating context as infrastructure is largely a matter of applying existing engineering discipline to a new class of artifact, one that happens to shape every decision an AI system makes.",[286,496,497],{},"The teams moving fastest appear to be the ones that recognized this early. They built the infrastructure to measure, test, and govern the context their models consume, and that investment compounded over time. 
For teams still tuning prompts by hand and evaluating by feel, the patterns are available to adopt directly, without rediscovering the hard lessons from scratch.",[267,499,271,503,271,506],{"className":500},[501,502],"references","mt-8",[279,504,505],{"id":501},"References",[507,508,336,514,336,532,336,542,336,552,336,562,336,572,336,582,271],"ol",{"className":509},[510,511,512,513],"list-decimal","list-inside","space-y-2","mt-4",[515,516,518,519,523,524],"li",{"id":517},"source-1","D. Commey, \"When 'Better' Prompts Hurt: Evaluation-Driven Iteration for LLM Applications,\" ",[520,521,522],"em",{},"arXiv",", 2026, ",[293,525,531],{"href":526,"target":527,"className":528},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.22025","_blank",[529,530],"text-blue-600","underline","[Online]",[515,533,535,536,523,538],{"id":534},"source-2","E. Calboreanu, \"Context Engineering: A Practitioner Methodology for Structured Human-AI Collaboration,\" ",[520,537,522],{},[293,539,531],{"href":540,"target":527,"className":541},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.04258",[529,530],[515,543,545,546,523,548],{"id":544},"source-3","M. S. Chishti et al., \"Test Before You Deploy: Governing Updates in the LLM Supply Chain,\" ",[520,547,522],{},[293,549,531],{"href":550,"target":527,"className":551},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.27789",[529,530],[515,553,555,556,523,558],{"id":554},"source-4","A. C. Maiorano, \"LLM Readiness Harness: Evaluation, Observability, and CI Gates for LLM\u002FRAG Applications,\" ",[520,557,522],{},[293,559,531],{"href":560,"target":527,"className":561},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.27355",[529,530],[515,563,565,566,523,568],{"id":564},"source-5","V. P. 
Bhardwaj, \"AgentAssay: Token-Efficient Regression Testing for Non-Deterministic AI Agent Workflows,\" ",[520,567,522],{},[293,569,531],{"href":570,"target":527,"className":571},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.02601",[529,530],[515,573,575,576,523,578],{"id":574},"source-6","W. Zhang et al., \"Loosely-Structured Software: Engineering Context, Structure, and Evolution Entropy in Runtime-Rewired Multi-Agent Systems,\" ",[520,577,522],{},[293,579,531],{"href":580,"target":527,"className":581},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.15690",[529,530],[515,583,585,586,588,589],{"id":584},"source-7","L. Chen et al., \"How Is ChatGPT's Behavior Changing over Time?,\" ",[520,587,522],{},", 2023, ",[293,590,531],{"href":591,"target":527,"className":592},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.09009",[529,530],{"title":594,"searchDepth":595,"depth":595,"links":596},"",2,[597,598,599,600,601,602],{"id":283,"depth":595,"text":284},{"id":313,"depth":595,"text":314},{"id":358,"depth":595,"text":359},{"id":430,"depth":595,"text":431},{"id":481,"depth":595,"text":482},{"id":501,"depth":595,"text":505},"2026-05-09","Most teams treat AI context as a runtime concern, something to tune session by session. The teams making the fastest progress treat it as a software dependency, versioned, tested, and governed. 
The infrastructure patterns for doing this already exist.","md",{"src":607},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1558494949-ef010cbdcc31?w=800&auto=format&fit=crop",{"authors":609,"badge":615,"source":617},[610],{"avatar":611,"name":613,"to":614},{"src":612},"\u002Fimg\u002Fmark_avatar.png","Mark Williams","https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fmarkwilliamsthinkata\u002F",{"label":616},"AI Engineering",{"name":618,"url":619},"Thinkata Research","https:\u002F\u002Fthinkata.com",true,{"title":142,"description":604},"ytvzj-4FpQSyhlfi_1zbQOsnMlIjEix-h83JPRQFEN8",[624,1002],{"id":625,"title":210,"body":626,"date":990,"description":991,"extension":605,"image":992,"meta":993,"navigation":620,"path":211,"seo":1000,"stem":212,"__hash__":1001,"_path":211},"insights\u002Fnews\u002Finsights\u002Frecursive-goodhart.md",{"type":264,"value":627,"toc":979},[628,640,648,651,669,673,685,688,692,705,711,714,718,732,738,741,745,755,770,774,777,787,791,794,802,805,815,819,827,830,840],[267,629,271,631,271,635],{"className":630},[270],[273,632,210],{"className":633,"id":634},[276],"when-optimization-optimizes-itself",[279,636,639],{"className":637,"id":638},[282],"recursive-goodharts-law-in-self-modifying-ai-systems","Recursive Goodhart's Law in Self-Modifying AI Systems",[267,641,336,643],{"style":642},"width: 100%; padding: 2%;",[338,644],{"src":645,"alt":646,"style":647},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1571313199464-6e7888cd7bb6?w=1200&auto=format&fit=crop","A row of matryoshka nesting dolls in decreasing size","width: 100%; height: auto;",[286,649,650],{},"Open a matryoshka and another doll is waiting, slightly smaller, with the same painted face. Self-improving AI systems are starting to take a similar shape. A task agent solves the problem in front of it. A meta agent, one level up, modifies the task agent. 
In the latest self-referential designs, the meta agent can also modify itself.",[286,652,653,654,658,659,663,664,668],{},"That nested structure is the design of hyperagents, a 2026 framework that places a task agent and a meta agent into a single editable program so that the improvement procedure itself can be improved ",[290,655,656],{},[293,657,296],{"href":295},". The lineage runs back through Schmidhuber's Gödel machine, which established the mathematical coherence of fully self-referential improvement ",[290,660,661],{},[293,662,323],{"href":322},", and the Darwin Gödel Machine, which made the idea practical in coding by retaining successful self-modifications in a growing archive ",[290,665,666],{},[293,667,368],{"href":367},". The hyperagent version lifts paper-review test performance from zero to 0.710, outperforming a hand-engineered reviewer baseline at 0.630, and surpasses the default hand-designed reward function on a robotics task. Those results are the upside. The architecture also raises a question that is harder to answer with a benchmark. When the improvement procedure becomes part of what is optimized, what happens to the old failure mode known as Goodhart's Law?",[279,670,672],{"id":671},"goodharts-law-now-with-nesting","Goodhart's Law, Now With Nesting",[286,674,675,676,680,681,441],{},"Goodhart's Law says that a measure stops being a good measure once it becomes a target. A school judged on test scores starts teaching to the test. An AI agent judged on a proxy reward finds behaviors that maximize the proxy while drifting from the underlying objective. Skalse and colleagues gave the phenomenon a formal treatment in 2022, showing that an unhackable pair of true and proxy reward functions is a much stronger condition than intuition would suggest ",[290,677,678],{},[293,679,406],{"href":405},". 
Empirical work since has traced the same dynamic across language model training, reinforcement learning, and multimodal systems ",[290,682,683],{},[293,684,421],{"href":420},[286,686,687],{},"A hyperagent changes the count of optimizers stacked on top of each other. A standard reinforcement learning loop has one. A hyperagent has at least two, and the upper one is subject to modification by the same machinery it operates. Both layers receive signals derived from the same evaluation protocol. The task agent gets credit for solving the task well. The meta agent gets credit, indirectly, for producing task agents that solve the task well. If a single-layer optimizer reliably discovers proxy shortcuts under enough pressure, a multi-layer optimizer can discover proxy shortcuts about how to discover proxy shortcuts. Standard Goodhart describes a system that games its metric. Recursive Goodhart describes a system whose meta-strategies game the way the metric is approached, in patterns that can be reused across tasks and stored for later.",[279,689,691],{"id":690},"a-concrete-demonstration","A Concrete Demonstration",[286,693,694,695,699,700,704],{},"The published runs make the recursive dynamic less abstract. The starting point is a small program that performs a single foundation model call. By the end of the runs, the system has autonomously added general-purpose infrastructure including persistent memory and performance tracking, then refined both across generations ",[290,696,697],{},[293,698,296],{"href":295},". Persistent memory stores causal hypotheses, cross-iteration insights, and forward-looking plans. The result parallels earlier work on automated design of agentic systems, which already showed that agent scaffolding can be discovered through open-ended search rather than hand-engineered ",[290,701,702],{},[293,703,440],{"href":439},". Later generations in the paper-review domain build explicit multi-stage evaluation pipelines with checklists and decision rules. 
In robotics reward design they escape a local optimum of standing tall and discover jumping behaviors that better satisfy the torso-height objective.",[267,706,336,707],{"style":642},[338,708],{"src":709,"alt":710,"style":647},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1769142919507-8ec02ea9711c?w=1200&auto=format&fit=crop","A metal ruler laid across printed text on a page",[286,712,713],{},"A ruler measures the geometry of words, not the meaning of the sentence. Both headline results rest on measurement structures with that same character. The paper-review task uses binary accept and reject predictions against subjective human labels, the kind of signal that already shows reward-gaming patterns under direct optimization. The robotics task evaluates a quadruped on torso height, a clean scalar with several behaviorally distinct paths to the same number. Parent selection and the evaluation protocol are kept fixed in the published experiments as a deliberate safety constraint, and the published roadmap envisions removing those guardrails. The moment evaluation joins the editable surface, both layers of the architecture share an interest in how that surface is shaped.",[279,715,717],{"id":716},"memory-as-the-carrier","Memory as the Carrier",[286,719,720,721,725,726,441],{},"Without persistent memory, recursive Goodhart would be a curiosity rather than a worry. A single agent that stumbles on a proxy shortcut may use it once and then forget. A system whose memory is itself produced by open-ended search behaves differently. Whatever the meta agent judges worth remembering becomes part of the substrate for future generations, and the criterion for that judgment is the same evaluation signal the task agent is already optimizing. Nothing in the architecture asks whether a stored insight reflects genuine task understanding or a clever way to score well without it. 
The ALMA framework reinforces the picture by showing that memory designs themselves can be meta-learned through open-ended search, outperforming hand-engineered baselines across four sequential decision-making domains ",[290,722,723],{},[293,724,375],{"href":374},". A 2026 survey of agent memory traces the same trend across the field, moving from static recall benchmarks toward multi-session agentic tests where memory and decision-making are intertwined ",[290,727,728],{},[293,729,731],{"href":730},"#source-8","[8]",[267,733,336,734],{"style":642},[338,735],{"src":736,"alt":737,"style":647},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1770869731843-bd36aa92403c?w=1200&auto=format&fit=crop","A wall of vintage wooden filing cabinet drawers",[286,739,740],{},"An archive of unlabeled drawers may hold some genuine insights, the kind a careful practitioner would write down. Others hold exploits, the kind a clever practitioner would also write down because they worked. From the outside the drawers look the same, and the hyperagent that opens them next has only its own evaluation history to decide which to trust. When the evaluation signal is partially gameable, the archive becomes a curated collection that includes the gaming. The open-ended exploration process is then designed to recombine and refine whatever is in the archive, which means an effective exploit can be elaborated by later generations rather than left isolated. Standard Goodhart describes a single move. Memory turns it into a sequence.",[279,742,744],{"id":743},"why-the-trap-is-structural","Why the Trap Is Structural",[286,746,747,748,754],{},"Treating evaluation gaming as a bug to be patched leaves much unexplained. Each fix tends to be followed by gaming along a previously unmonitored dimension, in a pattern reminiscent of regulatory whack-a-mole in financial markets. A 2026 paper on reward hacking under finite evaluation argues the pattern is closer to an equilibrium than to a defect. 
From five axioms about multi-dimensional quality, finite evaluation, effective optimization, resource finiteness, and combinatorial interaction among tools, the authors derive a result that any optimized agent will systematically under-invest in quality dimensions not covered by its evaluation system ",[290,749,750],{},[293,751,753],{"href":752},"#source-9","[9]",". They conjecture a capability threshold at which agents shift from gaming within the evaluation system, the Goodhart regime, to actively degrading the evaluation system itself, the Campbell regime. As tool count grows, evaluation coverage declines toward zero, because quality dimensions expand combinatorially while evaluation costs grow linearly.",[286,756,757,758,764,765,769],{},"Two complementary results pull in the same direction. The self-evolution trilemma formalizes the claim that an agent society cannot simultaneously satisfy continuous self-evolution, complete isolation from external oversight, and safety invariance, with isolated recursive systems developing statistical blind spots that drift the system off the human values its measures were meant to track ",[290,759,760],{},[293,761,763],{"href":762},"#source-10","[10]",". The Proxy Compression Hypothesis identifies evaluator-policy co-adaptation as a third reinforcing force, where policies and evaluators that evolve together tend to converge on shared blind spots rather than eliminate them ",[290,766,767],{},[293,768,421],{"href":420},". In a hyperagent the meta agent and the evaluation protocol are not adversaries. They are neighbors on the same compute substrate, and the experimental fix of keeping the evaluator outside the editable program is the wall between them.",[279,771,773],{"id":772},"transferable-hacks","Transferable Hacks",[286,775,776],{},"Meta-level improvements in the hyperagent setup transfer across domains. 
Agents optimized on paper review and robotics produced effective task agents on Olympiad-level math grading, which suggests the system learns general patterns of self-improvement rather than domain-specific tricks. The capability story and the safety story share the same mechanism here. Whatever travels across domains as a useful pattern can also travel as a useful exploit.",[286,778,779,780,786],{},"The empirical support already exists. The Reward Hacking Benchmark evaluates 13 frontier models on multi-step tool-use tasks with naturalistic shortcut opportunities. Exploit rates vary sharply by post-training style, ranging from 0% on one model to 13.9% on another trained with heavier reinforcement learning ",[290,781,782],{},[293,783,785],{"href":784},"#source-11","[11]",". Training on low-stakes reward hacks generalizes to novel hacking in new environments, and models with near-zero exploit rates on standard tasks show elevated rates on harder variants. Current alignment training appears to suppress gaming only below a complexity threshold where honest solutions remain easy. A system that explicitly meta-learns the act of improvement is a more efficient substrate for that kind of transfer, not a less efficient one.",[279,788,790],{"id":789},"what-engineering-could-do","What Engineering Could Do",[286,792,793],{},"Current safeguards in the published work include sandboxing, fixed parent selection, fixed evaluation, and human oversight at the run level. Each helps. Each has a known scaling limit.",[286,795,796,797,801],{},"Co-evolutionary evaluation is worth taking seriously as the first line of defense. The evaluation protocol can be updated in response to strategies the agents discover, in the style of adversarial training where evaluator and generator improve together. The risk flagged in the Proxy Compression literature is that co-evolution converges on shared blind spots rather than eliminating them ",[290,798,799],{},[293,800,421],{"href":420},". 
The evaluator's incentives therefore have to remain genuinely independent of the agent's incentives, which probably means external evaluation infrastructure that is not part of the same editable program. A closely related defense is diversity of evaluation. Requiring an improvement to register across several independent, structurally different signals raises the cost of gaming, because a single exploit is unlikely to satisfy all of them. Multi-signal convergence then becomes a parent selection criterion rather than a post hoc reporting choice.",[286,803,804],{},"Interpretability of the meta agent opens another channel. Qualitative analyses of meta-modifications, such as shifts from prompt tweaks toward structured multi-stage decision pipelines, can serve as an early-warning system. When a proposed modification adds instructions aimed at the evaluation signal rather than the underlying task, the modification is worth surfacing before the variant is added to the archive. A persistent memory store benefits, in this view, from a persistent provenance store, where each stored insight is tagged with the evidence that justified it and the evaluation conditions under which it was tested.",[286,806,807,808,814],{},"Human-in-the-loop oversight may need to move up a level. Most discussions of human review focus on the task agent's decisions. Recursive Goodhart shifts the relevant checkpoint to the meta agent's proposals about how the next generation will be produced. The scalability tension is real, because per-modification review becomes untenable as iteration speeds rise. Triage helps. Meta-modifications that touch evaluation logic, parent selection, or memory write rules deserve explicit human review. Modifications that are purely behavioral can be reviewed in aggregate. 
A position paper on open-ended AI safety has argued that guidance appropriate earlier may become ineffective later, requiring continual adaptation and undermining long-term control ",[290,809,810],{},[293,811,813],{"href":812},"#source-12","[12]",". Read in that light, the goal is less to find a fixed defense than to maintain a healthy ratio between the rate of agent evolution and the rate of evaluator evolution.",[279,816,818],{"id":817},"what-this-means-for-the-roadmap","What This Means for the Roadmap",[286,820,821,822,826],{},"The published roadmap identifies modification of evaluation protocols as a natural next step, with preliminary results already exploring automatic improvement of parent selection ",[290,823,824],{},[293,825,296],{"href":295},". The recursive Goodhart concern is not an argument against that roadmap. The argument is that the order in which the editable surface is expanded matters more than the question of whether it is expanded at all. Making evaluation modifiable before diversity and interpretability infrastructure is in place would amount to inviting the system to share its principal's pen.",[286,828,829],{},"A useful open question is what the observable signature of a hyperagent optimizing for evaluation-signal exploitability would look like. Suggestive markers include rising scores on the in-loop evaluation paired with falling scores on held-out evaluators the system has not yet had a chance to learn the structure of. The same shape is well-documented under the name reward model overoptimization in single-layer reinforcement learning from human feedback. 
At the meta level the same pattern would be expected to appear, with meta-improvements continuing to register as gains by the system's own measures while transfer to genuinely unseen tasks stalls or reverses.",[286,831,832,833,839],{},"Bengio, Hinton, Yao, and co-authors argued in 2024 that society's response to AI is lagging the rate of capability gain, and that current governance lacks mechanisms to address autonomous systems ",[290,834,835],{},[293,836,838],{"href":837},"#source-13","[13]",". Self-improving architectures sharpen that observation. Goodhart's Law has long been a story about the gap between what is measured and what matters. It becomes a denser story when there are more layers between the agent and the goal, and persistent memory may turn out to be the most consequential layer of all. Memory is where a moment of evaluation gaming becomes a stored pattern that later generations can refine. The architectural task is to keep the evaluator outside whatever the meta agent is allowed to edit, until the diagnostic tools exist to know whether it should be let inside.",[267,841,271,843,271,845],{"className":842},[501,502],[279,844,505],{"id":501},[507,846,336,848,336,857,336,867,336,877,336,888,336,897,336,908,336,917,336,927,336,937,336,947,336,957,336,967,271],{"className":847},[510,511,512,513],[515,849,850,851,523,853],{"id":517},"J. Zhang et al., \"HyperAgents,\" ",[520,852,522],{},[293,854,531],{"href":855,"target":527,"className":856},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.19461",[529,530],[515,858,859,860,862,863],{"id":534},"J. Schmidhuber, \"Gödel Machines: Self-Referential Universal Problem Solvers Making Provably Optimal Self-Improvements,\" ",[520,861,522],{},", 2003, ",[293,864,531],{"href":865,"target":527,"className":866},"https:\u002F\u002Farxiv.org\u002Fabs\u002Fcs\u002F0309048",[529,530],[515,868,869,870,872,873],{"id":544},"J. 
Zhang et al., \"Darwin Gödel Machine: Open-Ended Evolution of Self-Improving Agents,\" ",[520,871,522],{},", 2025, ",[293,874,531],{"href":875,"target":527,"className":876},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.22954",[529,530],[515,878,879,880,883,884],{"id":554},"J. Skalse et al., \"Defining and Characterizing Reward Hacking,\" in ",[520,881,882],{},"Advances in Neural Information Processing Systems",", vol. 35, 2022. DOI: ",[293,885,531],{"href":886,"target":527,"className":887},"https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2209.13085",[529,530],[515,889,890,891,523,893],{"id":564},"X. Wang et al., \"Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges,\" ",[520,892,522],{},[293,894,531],{"href":895,"target":527,"className":896},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.13602",[529,530],[515,898,899,900,903,904],{"id":574},"S. Hu, C. Lu, and J. Clune, \"Automated Design of Agentic Systems,\" in ",[520,901,902],{},"Proc. International Conference on Learning Representations (ICLR'25)",", 2025. DOI: ",[293,905,531],{"href":906,"target":527,"className":907},"https:\u002F\u002Fdoi.org\u002F10.48550\u002FarXiv.2408.08435",[529,530],[515,909,910,911,523,913],{"id":584},"Y. Xiong et al., \"Learning to Continually Learn via Meta-learning Agentic Memory Designs,\" ",[520,912,522],{},[293,914,531],{"href":915,"target":527,"className":916},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.07755",[529,530],[515,918,920,921,523,923],{"id":919},"source-8","P. Du, \"Memory for Autonomous LLM Agents: Mechanisms, Evaluation, and Emerging Frontiers,\" ",[520,922,522],{},[293,924,531],{"href":925,"target":527,"className":926},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.07670",[529,530],[515,928,930,931,523,933],{"id":929},"source-9","J. Wang and J. 
Huang, \"Reward Hacking as Equilibrium under Finite Evaluation,\" ",[520,932,522],{},[293,934,531],{"href":935,"target":527,"className":936},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.28063",[529,530],[515,938,940,941,523,943],{"id":939},"source-10","C. Wang et al., \"The Devil Behind Moltbook: Anthropic Safety is Always Vanishing in Self-Evolving AI Societies,\" ",[520,942,522],{},[293,944,531],{"href":945,"target":527,"className":946},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.09877",[529,530],[515,948,950,951,523,953],{"id":949},"source-11","K. Thaman, \"Reward Hacking Benchmark: Measuring Exploits in LLM Agents with Tool Use,\" ",[520,952,522],{},[293,954,531],{"href":955,"target":527,"className":956},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2605.02964",[529,530],[515,958,960,961,872,963],{"id":959},"source-12","I. Sheth et al., \"Safety is Essential for Responsible Open-Ended Systems,\" ",[520,962,522],{},[293,964,531],{"href":965,"target":527,"className":966},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.04512",[529,530],[515,968,970,971,974,975],{"id":969},"source-13","Y. Bengio et al., \"Managing Extreme AI Risks Amid Rapid Progress,\" ",[520,972,973],{},"Science",", vol. 384, no. 6698, pp. 842–845, 2024. DOI: ",[293,976,531],{"href":977,"target":527,"className":978},"https:\u002F\u002Fdoi.org\u002F10.1126\u002Fscience.adn0117",[529,530],{"title":594,"searchDepth":595,"depth":595,"links":980},[981,982,983,984,985,986,987,988,989],{"id":638,"depth":595,"text":639},{"id":671,"depth":595,"text":672},{"id":690,"depth":595,"text":691},{"id":716,"depth":595,"text":717},{"id":743,"depth":595,"text":744},{"id":772,"depth":595,"text":773},{"id":789,"depth":595,"text":790},{"id":817,"depth":595,"text":818},{"id":501,"depth":595,"text":505},"2026-05-15","Self-modifying AI systems can now edit the very procedure that improves them. 
That capability quietly changes how Goodhart's Law works, and persistent memory may be the channel through which evaluation-gaming compounds.",{"src":645},{"authors":994,"badge":997,"source":999},[995],{"avatar":996,"name":613,"to":619},{"src":612},{"label":998},"AI Safety",{"name":618,"url":619},{"title":210,"description":991},"-zC5XinKX4z7WBJ1MGYh32QGghch6AQwBSSQjJC5qg0",{"id":1003,"title":234,"body":1004,"date":1320,"description":1321,"extension":605,"image":1322,"meta":1324,"navigation":620,"path":235,"seo":1330,"stem":236,"__hash__":1331,"_path":235},"insights\u002Fnews\u002Finsights\u002Fstructured-iteration-quality.md",{"type":264,"value":1005,"toc":1311},[1006,1018,1021,1034,1037,1041,1054,1058,1064,1076,1084,1097,1101,1107,1110,1122,1130,1147,1151,1154,1167,1180,1184,1187,1190,1198],[267,1007,271,1009,271,1013],{"className":1008},[270],[273,1010,234],{"className":1011,"id":1012},[276],"the-turn-as-the-unit-of-quality",[279,1014,1017],{"className":1015,"id":1016},[282],"what-makes-iterative-refinement-productive-and-when-it-starts-to-hurt","What makes iterative refinement productive, and when it starts to hurt",[286,1019,1020],{},"Iterative refinement is one of the defining features of how language models are used in practice. Rather than producing a final result in a single pass, users and autonomous agents refine outputs across multiple turns of interaction. Early work on self-feedback and verbal reflection established that this approach reliably outperforms single-pass generation. But how reliably, and for how long?",[286,1022,1023,1024,1028,1029,1033],{},"A controlled study that ran 12-turn refinement conversations across ideation, code generation, and mathematical reasoning found that the answer depends almost entirely on what kind of feedback each turn provides ",[290,1025,1026],{},[293,1027,296],{"href":295},". In code and ideation tasks, gains arrived early and then plateaued. 
In math, late turns could still help, but only when the feedback was specific. Across all domains, vague instructions like \"improve it\" or \"make it better\" produced rapid saturation and output bloat. Targeted feedback addressing explicit quality dimensions, things like \"check whether the function handles the empty-list case\" or \"verify that the conclusion follows from the data in section two,\" sustained productive iteration further ",[290,1030,1031],{},[293,1032,296],{"href":295},". The difference between useful iteration and destructive iteration had little to do with the number of turns taken. It had everything to do with what happened inside each one.",[286,1035,1036],{},"This finding connects three ideas that keep appearing across recent AI systems research. Structured checklists decompose quality into individually verifiable criteria, formalizing what \"targeted feedback\" actually means. Selective memory architectures decide what to retain and what to forget between turns, preventing the context window from becoming a graveyard of stale instructions. Deterministic validation layers enforce constraints that probabilistic models cannot guarantee on their own. Each imposes structure on what would otherwise be an open-ended, drift-prone process.",[279,1038,1040],{"id":1039},"why-turns-go-wrong","Why Turns Go Wrong",[286,1042,1043,1044,1048,1049,1053],{},"Understanding why unstructured iteration degrades output requires looking at what happens inside a model's context window (the maximum amount of text a model can consider at once) as turns accumulate. Research on the \"lost in the middle\" phenomenon showed that language model performance is highest when relevant information appears at the beginning or end of the input, and drops significantly when the model must access information positioned in the middle of long contexts ",[290,1045,1046],{},[293,1047,323],{"href":322},". 
As conversations grow longer, earlier instructions are not just diluted by newer content. The model's attention mechanism actively deprioritizes them. A survey covering over 1,400 research papers formalized this challenge by decomposing context engineering into three stages, retrieval, processing, and management, each introducing its own failure modes ",[290,1050,1051],{},[293,1052,368],{"href":367},". The default mode of iterative interaction, appending each turn's output to a growing window without structured curation, is working against sustained quality from the start.",[279,1055,1057],{"id":1056},"checklists-that-steer","Checklists That Steer",[267,1059,336,1060],{"style":335},[338,1061],{"src":1062,"alt":1063,"style":647},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1500627321089-19f8ec7b3038?w=800&auto=format&fit=crop","Close-up of an audio mixing console with rows of individual channel faders, analogous to how structured checklists give AI systems separate controls for each quality dimension rather than a single dial for overall improvement",[286,1065,1066,1067,1071,1072,441],{},"A sound engineer at a mixing console adjusts each channel independently, setting levels for bass, treble, reverb, and compression on separate faders rather than turning a single \"make it sound better\" knob. Structured quality evaluation works the same way. The TICK framework demonstrated that decomposing quality into checklist-based yes\u002Fno questions is more reliable for both humans and language models than holistic scoring ",[290,1068,1069],{},[293,1070,406],{"href":405},". Answering \"Does the response address the user's budget constraint?\" is a simpler cognitive task than assigning an overall quality rating on a 10-point scale. 
The decomposition reduces the inconsistency that plagues open-ended judgments, and composable pipelines like AutoChecklist can now generate such criteria automatically from a task description ",[290,1073,1074],{},[293,1075,421],{"href":420},[286,1077,1078,1079,1083],{},"This connects directly to the 12-turn study's central finding. When Javaji et al. compared vague \"improve it\" feedback against prompts targeting specific quality dimensions, the targeted version sustained improvement over more turns precisely because it functioned as a single-item checklist ",[290,1080,1081],{},[293,1082,296],{"href":295},". A multi-item checklist extends this logic by ordering quality dimensions by importance. Each turn addresses the highest-priority unsatisfied criterion, and the checklist records what has already been verified so that subsequent turns do not undo earlier gains. The model is no longer guessing what \"better\" means. The checklist tells it.",[286,1085,1086,1087,1091,1092,1096],{},"This pattern appears in practitioner tools as well. The Codified Context framework, developed during construction of a 108,000-line C# distributed system, included a \"constitution\" file that functioned as a prioritized checklist ",[290,1088,1089],{},[293,1090,440],{"href":439},". Naming conventions came first, build commands second, orchestration protocols third. The ordering was not arbitrary. It reflected which violations were most costly to fix if left uncaught. Across 283 development sessions, this structure prevented repeated failures by ensuring each session validated high-priority constraints before moving to less critical ones ",[290,1093,1094],{},[293,1095,440],{"href":439},". 
The criteria themselves can be generated by a model, but the prioritization, the decision about which quality dimension matters most, still required human judgment about costs and consequences.",[279,1098,1100],{"id":1099},"remembering-what-matters","Remembering What Matters",[267,1102,336,1103],{"style":335},[338,1104],{"src":1105,"alt":1106,"style":647},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1474932430478-367dbb6832c1?w=800&auto=format&fit=crop","Old books densely packed on a library shelf, analogous to how AI memory systems must decide which accumulated knowledge stays accessible and which can be safely let go to prevent valuable information from being buried",[286,1108,1109],{},"A library that never removes a book eventually buries its most valuable references under sheer accumulation. AI memory faces a similar problem. A checklist that structures each turn is only useful if the system remembers what was checked and what was found, but retaining everything introduces its own degradation.",[286,1111,1112,1113,1117,1118,441],{},"The Agentic Context Engineering (ACE) framework named two failure modes that make this concrete ",[290,1114,1115],{},[293,1116,375],{"href":374},". Brevity bias is the tendency for iterative optimization to compress rich context into short, generic summaries that strip away the domain-specific knowledge that actually made previous turns successful. A detailed playbook that says \"when the build fails on the orchestration layer, check the gRPC timeout before restarting the container\" gets summarized into \"handle build failures appropriately,\" and the specific knowledge that prevented a two-hour debugging session disappears. Context collapse is the complementary failure. 
Successive rewrites gradually erode important details, each individual edit seeming reasonable in isolation but the cumulative effect hollowing out the context's value ",[290,1119,1120],{},[293,1121,375],{"href":374},[286,1123,1124,1125,1129],{},"ACE addressed both by treating context as an evolving playbook updated through structured, incremental additions rather than wholesale rewrites, achieving a 10.6% improvement over strong baselines ",[290,1126,1127],{},[293,1128,375],{"href":374},". One counterintuitive finding from this work is that language models appear to perform better with long, detailed contexts than with tight summaries. Unlike humans, who benefit from concise briefings, LLMs can extract relevance from comprehensive inputs autonomously. Stripping context down for brevity's sake may sacrifice exactly the edge-case knowledge that separates correct output from output that merely compiles.",[286,1131,1132,1133,1137,1138,1142,1143,441],{},"The Dynamic Cheatsheet (DC) framework demonstrates what effective curation looks like in practice ",[290,1134,1135],{},[293,1136,731],{"href":730},". DC equips a language model with a persistent, self-curating external memory. After each query, the system explicitly decides which problem-solving strategies deserve to be kept, which should be discarded, and which existing entries should be updated. The results are impressive. On math competition problems, one model's accuracy more than doubled (from 23% to 50%) by retaining algebraic insights across problems. On the Game of 24 puzzle, another model went from 10% to 99% by accumulating and reusing solution templates ",[290,1139,1140],{},[293,1141,731],{"href":730},". The gains did not come from better prompting or a larger model. They came from the system learning what was worth remembering, and what was not, across successive encounters with similar problems. 
Meta Context Engineering takes this one step further by having a separate agent optimize the curation procedures themselves, meaning even the format and structure of what gets remembered becomes subject to improvement ",[290,1144,1145],{},[293,1146,753],{"href":752},[279,1148,1150],{"id":1149},"hard-constraints-for-soft-outputs","Hard Constraints for Soft Outputs",[286,1152,1153],{},"Checklists and selective memory both improve iteration quality, but they share a limitation. Both rely on the language model itself, or a similar model, to make evaluative judgments. A model asked to evaluate its own output against a checklist can exhibit the same biases and inconsistencies that it exhibits in generation. For constraints that must hold without exception, a different mechanism is needed, one that removes the model from the decision entirely.",[286,1155,1156,1157,1161,1162,1166],{},"The general principle is to separate what the model does well (natural language understanding, flexible reasoning, tolerant interpretation of ambiguous input) from what it does poorly (logical guarantees, strict constraint enforcement). VERUS-LM demonstrates this by splitting reasoning into two responsibilities ",[290,1158,1159],{},[293,1160,763],{"href":762},". The language model translates a task description into a formal representation. A symbolic reasoning engine then performs logically sound inference over that representation. On logical reasoning benchmarks, the advantage of this hybrid approach grew as task complexity increased ",[290,1163,1164],{},[293,1165,763],{"href":762},". The model is good at understanding what the problem is. The symbolic engine is good at solving it correctly. Neither works as well alone.",[286,1168,1169,1170,1174,1175,1179],{},"An application of this division of labor uses the Lean 4 theorem prover as a verification layer for financial compliance ",[290,1171,1172],{},[293,1173,785],{"href":784},". 
Every proposed action by the agent is translated into a formal logical proposition and verified by the Lean 4 proof kernel before execution. If the proof does not check, the action does not execute. There is no probability threshold, no confidence score, no \"this looks right.\" A compliance rule under this architecture becomes a constraint enforced with mathematical certainty, independent of whatever the model's next-token distribution might prefer ",[290,1176,1177],{},[293,1178,785],{"href":784},". From a systems perspective, this is the kind of guarantee that makes the difference between a prototype and a production deployment in regulated industries.",[279,1181,1183],{"id":1182},"what-this-suggests","What This Suggests",[286,1185,1186],{},"The three mechanisms operate at different stages of the refinement cycle and address distinct failure modes. A checklist defines what \"better\" means for the current turn. Selective memory decides what to carry forward. Deterministic validation enforces constraints that must hold regardless of the model's probabilistic output.",[286,1188,1189],{},"Any one of these in isolation appears to be insufficient. A checklist without selective memory will eventually be overwhelmed by accumulated context. Selective memory without structured criteria risks curating toward the wrong quality dimensions. Deterministic validation without good memory and good criteria will enforce hard constraints on output that is otherwise drifting.",[286,1191,1192,1193,1197],{},"For teams building iterative workflows, whether for code generation, research, writing, or any domain where quality develops through successive passes, the practical takeaway is that the turn is the unit of design. The effort spent deciding what each turn evaluates, remembers, and enforces may matter at least as much as the effort spent on the initial prompt. 
Whether the structuring of turns will itself be automated, as early work on meta-level skill evolution tentatively suggests ",[290,1194,1195],{},[293,1196,753],{"href":752},", or whether it will remain a domain where human judgment about priorities and consequences provides durable value, is a question the field has not yet answered.",[267,1199,271,1201,271,1203],{"className":1200},[501,502],[279,1202,505],{"id":501},[507,1204,336,1206,336,1216,336,1227,336,1236,336,1246,336,1255,336,1264,336,1274,336,1284,336,1293,336,1302,271],{"className":1205},[510,511,512,513],[515,1207,1208,1209,872,1212],{"id":517},"S. R. Javaji et al., \"Another Turn, Better Output? A Turn-Wise Analysis of Iterative LLM Prompting,\" in ",[520,1210,1211],{},"Proc. NeurIPS 2025 Workshop on Multi-Turn Interactions with LLMs",[293,1213,531],{"href":1214,"target":527,"className":1215},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2509.06770",[529,530],[515,1217,1218,1219,1222,1223],{"id":534},"N. F. Liu et al., \"Lost in the Middle: How Language Models Use Long Contexts,\" ",[520,1220,1221],{},"Transactions of the Association for Computational Linguistics",", vol. 12, pp. 157–173, 2024, ",[293,1224,531],{"href":1225,"target":527,"className":1226},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2307.03172",[529,530],[515,1228,1229,1230,872,1232],{"id":544},"L. Mei et al., \"A Survey of Context Engineering for Large Language Models,\" ",[520,1231,522],{},[293,1233,531],{"href":1234,"target":527,"className":1235},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.13334",[529,530],[515,1237,1238,1239,1241,1242],{"id":554},"J. Cook et al., \"TICKing All the Boxes: Generated Checklists Improve LLM Evaluation and Generation,\" ",[520,1240,522],{},", 2024, ",[293,1243,531],{"href":1244,"target":527,"className":1245},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.03608",[529,530],[515,1247,1248,1249,523,1251],{"id":564},"K. Zhou and C. 
Tan, \"AutoChecklist: Composable Pipelines for Checklist Generation and Scoring with LLM-as-a-Judge,\" ",[520,1250,522],{},[293,1252,531],{"href":1253,"target":527,"className":1254},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2603.07019",[529,530],[515,1256,1257,1258,523,1260],{"id":574},"A. Vasilopoulos, \"Codified Context: Infrastructure for AI Agents in a Complex Codebase,\" ",[520,1259,522],{},[293,1261,531],{"href":1262,"target":527,"className":1263},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.20478",[529,530],[515,1265,1266,1267,523,1270],{"id":584},"Q. Zhang et al., \"Agentic Context Engineering: Evolving Contexts for Self-Improving Language Models,\" in ",[520,1268,1269],{},"Proc. International Conference on Learning Representations (ICLR)",[293,1271,531],{"href":1272,"target":527,"className":1273},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2510.04618",[529,530],[515,1275,1276,1277,523,1280],{"id":919},"M. Suzgun et al., \"Dynamic Cheatsheet: Test-Time Learning with Adaptive Memory,\" in ",[520,1278,1279],{},"Proc. European Chapter of the Association for Computational Linguistics (EACL)",[293,1281,531],{"href":1282,"target":527,"className":1283},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2504.07952",[529,530],[515,1285,1286,1287,523,1289],{"id":929},"H. Ye et al., \"Meta Context Engineering via Agentic Skill Evolution,\" ",[520,1288,522],{},[293,1290,531],{"href":1291,"target":527,"className":1292},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.21557",[529,530],[515,1294,1295,1296,872,1298],{"id":939},"B. Callewaert, S. Vandevelde, and J. Vennekens, \"VERUS-LM: A Versatile Framework for Combining LLMs with Symbolic Reasoning,\" ",[520,1297,522],{},[293,1299,531],{"href":1300,"target":527,"className":1301},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2501.14540",[529,530],[515,1303,1304,1305,523,1307],{"id":949},"D. Rashie and V. 
Rashi, \"Type-Checked Compliance: Deterministic Guardrails for Agentic Financial Systems Using Lean 4 Theorem Proving,\" ",[520,1306,522],{},[293,1308,531],{"href":1309,"target":527,"className":1310},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.01483",[529,530],{"title":594,"searchDepth":595,"depth":595,"links":1312},[1313,1314,1315,1316,1317,1318,1319],{"id":1016,"depth":595,"text":1017},{"id":1039,"depth":595,"text":1040},{"id":1056,"depth":595,"text":1057},{"id":1099,"depth":595,"text":1100},{"id":1149,"depth":595,"text":1150},{"id":1182,"depth":595,"text":1183},{"id":501,"depth":595,"text":505},"2026-05-04","Iterative refinement with language models can improve or degrade output depending on what happens inside each turn. Structured checklists, selective memory, and deterministic validation are three mechanisms that determine whether successive passes build quality or erode it.",{"src":1323},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1759148414274-c8df3fb77f8c?w=800&auto=format&fit=crop",{"authors":1325,"badge":1328,"source":1329},[1326],{"avatar":1327,"name":613,"to":619},{"src":612},{"label":616},{"name":618,"url":619},{"title":234,"description":1321},"4ivaNfcPk1nk_mVk9hZ7-G2llRUC36tffLkhX5QUrpM",1778947327125]