[{"data":1,"prerenderedAt":959},["ShallowReactive",2],{"navigation":3,"\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split":249,"\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split-surround":620},[4,8,17,21,25,29,33,37,237,241,245],{"title":5,"path":6,"stem":7},"About Thinkata Intelligence","\u002Fabout","about",{"title":9,"path":10,"stem":11,"children":12},"Authentication","\u002Fauth","auth",[13],{"title":14,"path":15,"stem":16},"Email Confirmation","\u002Fauth\u002Fconfirmation","auth\u002Fconfirmation",{"title":18,"path":19,"stem":20},"Case Studies","\u002Fcase-studies","case-studies",{"title":22,"path":23,"stem":24},"Contact Us","\u002Fcontact","contact",{"title":26,"path":27,"stem":28},"Thinkata - Advanced AI Engineering & Multi-Agent System Solutions","\u002F","index",{"title":30,"path":31,"stem":32},"Insights","\u002Finsights","insights",{"title":34,"path":35,"stem":36},"Leadership","\u002Fleadership","leadership",{"title":38,"path":39,"stem":40,"children":41},"News","\u002Fnews","news",[42,45,69],{"title":43,"path":39,"stem":44},"News & Insights","news\u002Findex",{"title":18,"path":46,"stem":47,"children":48},"\u002Fnews\u002Fcase-studies","news\u002Fcase-studies",[49,53,57,61,65],{"title":50,"path":51,"stem":52},"Building Secure and Scalable AI Infrastructure: Integrating with Existing Systems through Modern Cloud Frameworks","\u002Fnews\u002Fcase-studies\u002Fcloud-infrastructure-ai","news\u002Fcase-studies\u002Fcloud-infrastructure-ai",{"title":54,"path":55,"stem":56},"Making Sense of Financial Regulations: How AI Teams Can Tackle Complex Documents","\u002Fnews\u002Fcase-studies\u002Ffinancial-regulations","news\u002Fcase-studies\u002Ffinancial-regulations",{"title":58,"path":59,"stem":60},"AI-Powered Transformations in Healthcare","\u002Fnews\u002Fcase-studies\u002Fhealth-care","news\u002Fcase-studies\u002Fhealth-care",{"title":62,"path":63,"stem":64},"Generative AI in Upstream Natural Gas: Shell's Exploration Initiative","\u002Fnews\u002Fcase-studies\u002Foil-gas","news\u002Fcase-studies\u002Foil-gas",{"title":66,"path":67,"stem":68},"Optimizing Manufacturing with AI-Driven Multi-Agent Systems","\u002Fnews\u002Fcase-studies\u002Fsupply-chain-optimization","news\u002Fcase-studies\u002Fsupply-chain-optimization",{"title":30,"path":70,"stem":71,"children":72},"\u002Fnews\u002Finsights","news\u002Finsights",[73,77,81,85,89,93,97,101,105,109,113,117,121,125,129,133,137,141,145,149,153,157,161,165,169,173,177,181,185,189,193,197,201,205,209,213,217,221,225,229,233],{"title":74,"path":75,"stem":76},"The Capability-Reliability Split in Agent Systems","\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split","news\u002Finsights\u002Fagent-capability-reliability-split",{"title":78,"path":79,"stem":80},"The Rise of AI Agents in Cyberattacks: Latest Research and Threats","\u002Fnews\u002Finsights\u002Fai-agent-cyber-threats","news\u002Finsights\u002Fai-agent-cyber-threats",{"title":82,"path":83,"stem":84},"The Smart Enterprise AI Stack: Why Teams of AI Agents Beat Solo Models Consistently","\u002Fnews\u002Finsights\u002Fai-architecture","news\u002Finsights\u002Fai-architecture",{"title":86,"path":87,"stem":88},"When Seeing Everything Becomes the Only Option","\u002Fnews\u002Finsights\u002Fai-comprehensive-observability","news\u002Finsights\u002Fai-comprehensive-observability",{"title":90,"path":91,"stem":92},"The Data Infrastructure AI-Native Systems Can't 
Ignore","\u002Fnews\u002Finsights\u002Fai-data-layer","news\u002Finsights\u002Fai-data-layer",{"title":94,"path":95,"stem":96},"Enterprise AI Triage Systems: Intelligent Automation for Large-Scale Operations","\u002Fnews\u002Finsights\u002Fai-enterprise-triage","news\u002Finsights\u002Fai-enterprise-triage",{"title":98,"path":99,"stem":100},"When Oversight Becomes Infrastructure","\u002Fnews\u002Finsights\u002Fai-governed-autonomy","news\u002Finsights\u002Fai-governed-autonomy",{"title":102,"path":103,"stem":104},"Designing for Graceful Failure in Compound AI Systems","\u002Fnews\u002Finsights\u002Fai-graceful-failure","news\u002Finsights\u002Fai-graceful-failure",{"title":106,"path":107,"stem":108},"Intelligent Composability: Building AI Systems Like Orchestra, Not Soloists","\u002Fnews\u002Finsights\u002Fai-intelligent-composability","news\u002Finsights\u002Fai-intelligent-composability",{"title":110,"path":111,"stem":112},"Building the Plane While Flying It — Migrating from Monolith to AI-Native Without Stopping","\u002Fnews\u002Finsights\u002Fai-migration-path","news\u002Finsights\u002Fai-migration-path",{"title":114,"path":115,"stem":116},"Stability Through Continuous Adaptation","\u002Fnews\u002Finsights\u002Fai-native-overview","news\u002Finsights\u002Fai-native-overview",{"title":118,"path":119,"stem":120},"Provable Stability: Mathematical Guarantees for Adaptive AI Systems","\u002Fnews\u002Finsights\u002Fai-provable-stability","news\u002Finsights\u002Fai-provable-stability",{"title":122,"path":123,"stem":124},"How Temperature Tuning Makes or Breaks Reinforcement Learning","\u002Fnews\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse","news\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse",{"title":126,"path":127,"stem":128},"Testing What Can't Be Predicted","\u002Fnews\u002Finsights\u002Fai-systems-testing","news\u002Finsights\u002Fai-systems-testing",{"title":130,"path":131,"stem":132},"Closing the Loop: How Human Corrections Can Make AI Systems Smarter Over Time","\u002Fnews\u002Finsights\u002Fclosing-the-loop","news\u002Finsights\u002Fclosing-the-loop",{"title":134,"path":135,"stem":136},"Multi-Path Reasoning: Collaborative and Competitive Approaches in AI","\u002Fnews\u002Finsights\u002Fcollaborative-competitive-agents","news\u002Finsights\u002Fcollaborative-competitive-agents",{"title":138,"path":139,"stem":140},"Why Challenges Supercharge Smarts for Humans and AI","\u002Fnews\u002Finsights\u002Fcompetition-improves-ai","news\u002Finsights\u002Fcompetition-improves-ai",{"title":142,"path":143,"stem":144},"Context is the New Code","\u002Fnews\u002Finsights\u002Fcontext-is-new-code","news\u002Finsights\u002Fcontext-is-new-code",{"title":146,"path":147,"stem":148},"Continuous Thought Machines","\u002Fnews\u002Finsights\u002Fcontinuous-thought-machines","news\u002Finsights\u002Fcontinuous-thought-machines",{"title":150,"path":151,"stem":152},"Don't Vibe, Architect","\u002Fnews\u002Finsights\u002Fdont-vibe-architect","news\u002Finsights\u002Fdont-vibe-architect",{"title":154,"path":155,"stem":156},"The Edge of the Underdefined","\u002Fnews\u002Finsights\u002Fedge-of-the-underdefined","news\u002Finsights\u002Fedge-of-the-underdefined",{"title":158,"path":159,"stem":160},"A Multi-Tier Safety Architecture for Critical Applications","\u002Fnews\u002Finsights\u002Ffour-tier-architecture","news\u002Finsights\u002Ffour-tier-architecture",{"title":162,"path":163,"stem":164},"Hybrid Autoregressive Residual 
Tokens","\u002Fnews\u002Finsights\u002Fhart-model","news\u002Finsights\u002Fhart-model",{"title":166,"path":167,"stem":168},"Hierarchical Reasoning in Artificial Intelligence","\u002Fnews\u002Finsights\u002Fhierarchical-approaches","news\u002Finsights\u002Fhierarchical-approaches",{"title":170,"path":171,"stem":172},"Latent Diffusion for Language Generation: A Comprehensive Overview","\u002Fnews\u002Finsights\u002Flatent-diffusion-for-language","news\u002Finsights\u002Flatent-diffusion-for-language",{"title":174,"path":175,"stem":176},"Breaking Language Barriers: How AI Can Translate Without Examples","\u002Fnews\u002Finsights\u002Flearning-languages","news\u002Finsights\u002Flearning-languages",{"title":178,"path":179,"stem":180},"The Emergence of AI Deception: How Large Language Models Have Learned to Strategically Mislead Users","\u002Fnews\u002Finsights\u002Fllm-deception","news\u002Finsights\u002Fllm-deception",{"title":182,"path":183,"stem":184},"Synergizing Specialized Reasoning and General Capabilities in AI","\u002Fnews\u002Finsights\u002Fllm-reasoning-advances","news\u002Finsights\u002Fllm-reasoning-advances",{"title":186,"path":187,"stem":188},"The AI That Rewrites Itself: MIT's Breakthrough in Self-Adapting Language Models","\u002Fnews\u002Finsights\u002Fllm-seal","news\u002Finsights\u002Fllm-seal",{"title":190,"path":191,"stem":192},"Metacognitive Reinforcement Learning for Self-Improving AI Systems","\u002Fnews\u002Finsights\u002Fmetacognitive-reinforcement-learning","news\u002Finsights\u002Fmetacognitive-reinforcement-learning",{"title":194,"path":195,"stem":196},"Revolutionary Advancements in Mixture of Experts (MoE) Architectures","\u002Fnews\u002Finsights\u002Fmixture-of-experts","news\u002Finsights\u002Fmixture-of-experts",{"title":198,"path":199,"stem":200},"Balancing Neural Plasticity and Stability","\u002Fnews\u002Finsights\u002Fneural-plasticity","news\u002Finsights\u002Fneural-plasticity",{"title":202,"path":203,"stem":204},"Offline RL and the Data Flywheel","\u002Fnews\u002Finsights\u002Foffline-rl-data-flywheel","news\u002Finsights\u002Foffline-rl-data-flywheel",{"title":206,"path":207,"stem":208},"Reward Design as Architecture","\u002Fnews\u002Finsights\u002Freward-design-as-architecture","news\u002Finsights\u002Freward-design-as-architecture",{"title":210,"path":211,"stem":212},"When Success Has No Author: The Temporal Credit Assignment Problem","\u002Fnews\u002Finsights\u002Frl-credit-assignment-problem","news\u002Finsights\u002Frl-credit-assignment-problem",{"title":214,"path":215,"stem":216},"Beyond Entropy Collapse: When Exploration Succeeds but Learning Fails","\u002Fnews\u002Finsights\u002Frl-optimization-gaps","news\u002Finsights\u002Frl-optimization-gaps",{"title":218,"path":219,"stem":220},"The Path to Practical Confidential Computing for AI Systems","\u002Fnews\u002Finsights\u002Fsecure-ai-architectures","news\u002Finsights\u002Fsecure-ai-architectures",{"title":222,"path":223,"stem":224},"Spiking Neural Networks for Energy-Efficient AI","\u002Fnews\u002Finsights\u002Fspiking-neural-networks","news\u002Finsights\u002Fspiking-neural-networks",{"title":226,"path":227,"stem":228},"AI Speech Translation: Breaking Down Language Barriers","\u002Fnews\u002Finsights\u002Fsts-performance-advances","news\u002Finsights\u002Fsts-performance-advances",{"title":230,"path":231,"stem":232},"Test-Time Training Layers: The Next Evolution in Transformer 
Architecture","\u002Fnews\u002Finsights\u002Ftest-time-training-layers","news\u002Finsights\u002Ftest-time-training-layers",{"title":234,"path":235,"stem":236},"Breakthrough: Large Language Models Pass the Turing Test","\u002Fnews\u002Finsights\u002Fturing-tests","news\u002Finsights\u002Fturing-tests",{"title":238,"path":239,"stem":240},"Privacy Policy","\u002Fprivacy","privacy",{"title":242,"path":243,"stem":244},"Research","\u002Fresearch","research",{"title":246,"path":247,"stem":248},"Terms of Service","\u002Fterms","terms",{"id":250,"title":74,"body":251,"date":601,"description":602,"extension":603,"image":604,"meta":606,"navigation":617,"path":75,"seo":618,"stem":76,"__hash__":619},"insights\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split.md",{"type":252,"value":253,"toc":581},"minimark",[254,273,277,280,284,296,301,311,315,325,328,332,342,368,377,381,391,399,403,420,424,434,438,446,450,453,456],[255,256,259,260,259,266],"div",{"className":257},[258],"page-title","\n  ",[261,262,74],"h1",{"className":263,"id":265},[264],"page-title__main","the-capability-reliability-split-in-agent-systems",[267,268,272],"h2",{"className":269,"id":271},[270],"page-title__sub","why-frontier-agents-reach-state-of-the-art-on-one-run-and-fail-at-the-same-task-on-the-next","Why frontier agents reach state-of-the-art on one run, and fail at the same task on the next",[274,275,276],"p",{},"A frontier agent can occasionally surpass a published research baseline and, in another run on the same task, fail to make any meaningful progress. The pattern recurs often enough across recent evaluations that researchers have started to treat it as a structural feature of agent systems rather than a quirk of any single implementation. Capability asks whether a model can perform a task in principle. Reliability asks whether it does so consistently, across repeated attempts, across small perturbations, and across tasks that take dozens or hundreds of steps to complete. Recent evidence suggests these two properties drift apart faster than benchmark headlines make visible.",[274,278,279],{},"The split has practical stakes. An agent system, in this context, refers to a large language model (LLM, the underlying neural network that processes text) coupled with a scaffold (the surrounding software that decides when to call the model, what tools to invoke, and how to handle errors). When the same agent passes a benchmark on Monday and breaks on a near-identical task on Tuesday, the deployment question is no longer whether the technology can do the work. The question becomes how often it does.",[267,281,283],{"id":282},"when-the-same-agent-both-wins-and-fails","When the Same Agent Both Wins and Fails",[274,285,286,287,295],{},"ResearchGym, a benchmark that places agents inside containerized research environments rebuilt from accepted papers at ICML, ICLR, and ACL, captures the split with unusual clarity. In a controlled evaluation of an agent powered by GPT-5, the system improved over the provided baselines in only 1 of 15 evaluations, an improvement rate of 6.7%, and completed only 26.5% of sub-tasks on average across 39 sub-tasks total ",[288,289,290],"sup",{},[291,292,294],"a",{"href":293},"#source-1","[1]",". In a single run, the same agent surpassed the solution from an ICML 2025 Spotlight paper, evidence that the underlying capability is real even when the reliability is not. 
## Why Standard Benchmarks Miss the Gap

Part of the reliability story is methodological. Most agent evaluations report pass@1, the probability that an agent succeeds on a single attempt. A 2026 study collected 60,000 agentic trajectories on SWE-Bench-Verified, a software engineering benchmark, across three models and two scaffolds, and found that single-run pass@1 estimates vary by 2.2 to 6.0 percentage points depending on which run is selected, with standard deviations exceeding 1.5 percentage points even at temperature 0, the setting that should produce the most deterministic behavior [4]. Reported improvements of 2 to 3 percentage points, the kind that often headline a new release, may reflect evaluation noise rather than genuine progress. Trajectories diverged early, often within the first few percent of generated tokens (a token is the unit of text the model processes, roughly a word or word fragment), and these small differences cascaded into entirely different solution strategies.
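The bookkeeping behind this critique fits in a few lines. The sketch below uses synthetic outcomes rather than the study's actual trajectories; the task counts, run counts, and success rates are illustrative, and the pass^k estimator shown (the pessimistic counterpart of pass@k, which reappears later in this piece) is the standard combinatorial one, not necessarily the exact formula any one paper uses.

```python
import random
from math import comb
from statistics import mean, stdev

random.seed(0)
n_tasks, n_runs, k = 40, 5, 3

# results[t][r] = True if the agent solved task t on independent run r.
# Synthetic Bernoulli outcomes stand in for real trajectories.
results = [[random.random() < 0.55 for _ in range(n_runs)]
           for _ in range(n_tasks)]

# Single-run pass@1: the fraction of tasks solved on one particular run.
# Different runs give noticeably different headline numbers.
per_run = [mean(results[t][r] for t in range(n_tasks)) for r in range(n_runs)]
print(f"pass@1 by run: min {min(per_run):.3f}, max {max(per_run):.3f}, "
      f"mean {mean(per_run):.3f}, std {stdev(per_run):.3f}")

def pass_to_the_k(successes: int, attempts: int, k: int) -> float:
    """Estimate of the probability that k fresh attempts ALL succeed:
    C(c, k) / C(n, k), the pessimistic counterpart of pass@k."""
    return comb(successes, k) / comb(attempts, k)

print(f"pass^{k}: {mean(pass_to_the_k(sum(row), n_runs, k) for row in results):.3f}")
```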
![Aircraft cockpit instrument panel with multiple separate gauges and dials, analogous to how a reliability science framework breaks agent performance into independent measurements rather than a single accuracy score](https://images.unsplash.com/photo-1540962351504-03099e0a754b?w=800&auto=format&fit=crop)

### A Reliability Science for Agents

Just as a cockpit instrument panel separates altitude, airspeed, and fuel into independent gauges so a pilot can see when one is failing, a reliability science framework released in March 2026 splits agent performance into separate dimensions tracked over time. The authors evaluated 10 models across 23,392 episodes on a 396-task benchmark that varied task duration and domain, and proposed four metrics including a Reliability Decay Curve, which tracks how success rate falls as tasks lengthen, and a Variance Amplification Factor, which measures how variability in outcomes grows with horizon [5]. Capability and reliability rankings diverged substantially, with multi-rank inversions at long horizons. A model ranked first on short tasks could fall to fourth or fifth once tasks stretched out. Frontier models showed the highest meltdown rates, up to 19%, because they attempted ambitious multi-step strategies that sometimes spiraled into failure.

A March 2025 survey of agent evaluation methods, updated through 2026, identified the same pattern at a higher level. Cost-efficiency, safety, and robustness remain underassessed in most agent benchmarks [6].
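The two named metrics can be approximated from nothing more than (horizon, outcome) records. The sketch below is a crude stand-in rather than the paper's definitions: the decay curve is just success rate binned by task length, and the amplification factor is a simple variance ratio between the longest and shortest bins, on synthetic data where success probability decays with horizon.

```python
import random
from collections import defaultdict
from statistics import mean, pvariance

random.seed(1)
# episodes: (horizon_in_steps, succeeded) pairs. In this synthetic data the
# success probability decays with horizon, mimicking the reported pattern.
episodes = [(h, random.random() < 0.92 - 0.002 * h)
            for h in random.choices(range(5, 200), k=3000)]

by_bin = defaultdict(list)
for horizon, ok in episodes:
    by_bin[horizon // 40].append(1.0 if ok else 0.0)

# Reliability decay curve: success rate per horizon bin.
for b in sorted(by_bin):
    print(f"steps {40 * b:>3}-{40 * b + 39}: pass rate {mean(by_bin[b]):.2f}")

# Variance amplification factor: outcome variance at the longest horizon
# relative to the shortest. A rough stand-in for the paper's metric.
bins = sorted(by_bin)
vaf = pvariance(by_bin[bins[-1]]) / max(pvariance(by_bin[bins[0]]), 1e-9)
print(f"variance amplification factor ~= {vaf:.1f}x")
```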
## The Mechanics of Long-Horizon Failure

The next question is mechanical. What is actually breaking when an agent that performs well on short tasks falls apart on long ones? A January 2026 analysis frames the answer as a mismatch between reasoning and planning. Step-wise reasoning, the chain-of-thought pattern that has driven much of the recent progress in LLMs, induces what the authors call a step-wise greedy policy [7]. The agent picks the locally best next action without modeling delayed consequences. Over short horizons this often suffices. Over long horizons, early myopic commitments compound and become difficult to recover from. The proposed fix, FLARE (Future-aware Lookahead with Reward Estimation), pushes value propagation back through the trajectory so that downstream outcomes can shape early decisions. Across multiple benchmarks, FLARE often allowed a smaller open-source model to outperform a larger frontier model running standard step-by-step reasoning. The argument draws a clearer line between reasoning, the local manipulation of intermediate steps, and planning, the explicit consideration of how early choices constrain later ones.
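A hand-built toy makes the distinction concrete. This is not FLARE itself, only the failure mode it targets: a greedy policy that maximizes immediate reward versus one that backs up downstream value before choosing, on a small decision tree where the locally best first move leads nowhere.

```python
# tree[state] = list of (action, immediate_reward, next_state); states
# missing from the dict are terminal. Rewards are arbitrary toy values.
tree = {
    "start":    [("quick-fix", 5.0, "dead-end"), ("groundwork", 1.0, "setup")],
    "dead-end": [("retry", 0.0, "stuck")],
    "setup":    [("build", 2.0, "payoff")],
    "payoff":   [("ship", 10.0, "done")],
    "stuck":    [("give-up", 0.0, "done")],
}

def value(state) -> float:
    """Best achievable return from this state onward (backed-up value)."""
    if state not in tree:
        return 0.0
    return max(r + value(nxt) for _, r, nxt in tree[state])

def rollout(state, score):
    """Follow the policy induced by `score` until a terminal state."""
    total, path = 0.0, []
    while state in tree:
        action, r, state = max(tree[state], key=score)
        total += r
        path.append(action)
    return total, path

greedy = lambda edge: edge[1]                      # immediate reward only
lookahead = lambda edge: edge[1] + value(edge[2])  # reward plus future value

print(rollout("start", greedy))     # (5.0, ['quick-fix', 'retry', 'give-up'])
print(rollout("start", lookahead))  # (13.0, ['groundwork', 'build', 'ship'])
```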
ResearchGym catalogs the same phenomenon from the failure side. Across runs, the recurring problems were impatience, poor time and resource management, overconfidence in weak hypotheses, difficulty coordinating parallel experiments, and hard limits imposed by context length, the maximum number of tokens an LLM can consider at once [1]. None of these are pure capability failures. An agent that knows what a good experiment looks like can still abandon it too early, commit to the wrong hypothesis with too much confidence, or simply run out of working memory before the task ends. The capabilities the model has in isolation do not translate cleanly into behavior under sustained pressure.

## What Helps, and What Surprisingly Does Not

Mitigation research has clustered around test-time scaling, the practice of allocating more compute at inference time to improve outcomes without retraining. The first systematic study of test-time scaling for language agents, published in mid-2025, found that scaling helps, that knowing when to reflect matters, that list-wise verification methods, which compare a list of candidates jointly rather than ranking them pairwise, outperform alternatives, and that diversifying rollouts has a positive effect on task performance [8]. A 2026 framework called ARTIS extended these ideas to settings where actions touch external systems and cannot be undone, by decoupling exploration from commitment through simulated interactions before real-world execution [9]. The authors flag a less obvious finding. Naive LLM-based simulators struggle to capture rare but high-impact failure modes, which means simulators have to be deliberately trained to be honest about how things go wrong, not only how they go right.
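The difference between list-wise and pairwise verification is largely an interface difference, which a stub makes visible. The `judge` callables below stand in for LLM calls; everything else, including the shortest-wins heuristic, is illustrative rather than anything the study prescribes.

```python
from typing import Callable, Sequence

def listwise_select(candidates: Sequence[str],
                    judge: Callable[[Sequence[str]], int]) -> str:
    """One verifier call that sees the whole list and returns an index."""
    return candidates[judge(candidates)]

def pairwise_select(candidates: Sequence[str],
                    prefer_second: Callable[[str, str], bool]) -> str:
    """A chain of head-to-head comparisons. The survivor can depend on
    comparison order, one reason pairwise ranking is less stable."""
    best = candidates[0]
    for challenger in candidates[1:]:
        if prefer_second(best, challenger):
            best = challenger
    return best

# Stub verifiers standing in for LLM calls; a real system would prompt a
# model with all candidates at once (list-wise) or two at a time (pairwise).
shortest = lambda cands: min(range(len(cands)), key=lambda i: len(cands[i]))
second_shorter = lambda a, b: len(b) < len(a)

rollouts = ["patch: rewrite module", "patch: add guard clause", "patch: noop"]
assert listwise_select(rollouts, shortest) == "patch: noop"
assert pairwise_select(rollouts, second_shorter) == "patch: noop"
```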
Wang et al., \"Why Reasoning Fails to Plan: A Planning-Centric Analysis of Long-Horizon Decision Making in LLM Agents,\" ",[478,546,480],{},[291,548,489],{"href":549,"target":485,"className":550},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2601.22311",[487,488],[473,552,554,555,506,557],{"id":553},"source-8","K. Zhu et al., \"Scaling Test-time Compute for LLM Agents,\" ",[478,556,480],{},[291,558,489],{"href":559,"target":485,"className":560},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2506.12928",[487,488],[473,562,564,565,481,567],{"id":563},"source-9","X. Zeng et al., \"ARTIS: Agentic Risk-Aware Test-Time Scaling via Iterative Simulation,\" ",[478,566,480],{},[291,568,489],{"href":569,"target":485,"className":570},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2602.01709",[487,488],[473,572,574,575,481,577],{"id":573},"source-10","J. Kim et al., \"Scaling Test-Time Compute for Agentic Coding,\" ",[478,576,480],{},[291,578,489],{"href":579,"target":485,"className":580},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.16529",[487,488],{"title":582,"searchDepth":583,"depth":583,"links":584},"",2,[585,586,591,594,595,599,600],{"id":271,"depth":583,"text":272},{"id":282,"depth":583,"text":283,"children":587},[588,590],{"id":299,"depth":589,"text":300},3,{"id":313,"depth":589,"text":314},{"id":330,"depth":583,"text":331,"children":592},[593],{"id":355,"depth":589,"text":356},{"id":379,"depth":583,"text":380},{"id":401,"depth":583,"text":402,"children":596},[597,598],{"id":422,"depth":589,"text":423},{"id":436,"depth":589,"text":437},{"id":448,"depth":583,"text":449},{"id":459,"depth":583,"text":463},"2026-04-25","Why frontier agents reach state-of-the-art on one run and fail at the same task on the next, and what evaluation needs to change.","md",{"src":605},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1518770660439-4636190af475?w=800&auto=format&fit=crop",{"authors":607,"badge":613,"source":615},[608],{"avatar":609,"name":611,"to":612},{"src":610},"\u002Fimg\u002Fmark_avatar.png","Mark Williams","https:\u002F\u002Fthinkata.com",{"label":614},"AI Engineering",{"name":616,"url":612},"Thinkata Research",true,{"title":74,"description":602},"0a5e4zWiRLVunzuNxOgmhm3xM6u7S8XiD0kTkSLs9Go",[621,622],null,{"id":623,"title":154,"body":624,"date":947,"description":948,"extension":603,"image":949,"meta":951,"navigation":617,"path":155,"seo":957,"stem":156,"__hash__":958,"_path":155},"insights\u002Fnews\u002Finsights\u002Fedge-of-the-underdefined.md",{"type":252,"value":625,"toc":939},[626,638,651,654,658,666,674,708,716,720,723,731,758,780,783,787,790,793,800,806,815,821,824,828,831,834,837],[255,627,259,629,259,633],{"className":628},[258],[261,630,154],{"className":631,"id":632},[264],"the-edge-of-the-underdefined",[267,634,637],{"className":635,"id":636},[270],"what-stays-human-when-agents-learn-to-engineer-their-own-context","What stays human when agents learn to engineer their own context",[274,639,640],{},[478,641,642,643,646,647,650],{},"This is the final article in \"The Meta-Engineer,\" a three-part series examining how AI is reshaping the identity and skill set of software engineers. The first article, ",[291,644,645],{"href":143},"\"Context is the New Code,\""," traced the rise of context engineering as a discipline. The second, ",[291,648,649],{"href":151},"\"Don't Vibe, Architect,\""," showed how professionals orchestrate agents at scale. Both ended with the same uncomfortable observation. 
### What Hurts

The same reliability framework that documented divergence between capability and reliability also reported a counterintuitive negative result. Across all 10 models tested, memory scaffolds, the systems designed to give agents persistent context across turns, universally hurt long-horizon performance [5]. The default assumption that more memory is always better appears to be wrong in this regime, at least for the scaffolds and tasks studied. The HAL finding that higher reasoning effort can reduce accuracy points in a similar direction. More of a thing is not always more useful.

## What This Might Mean

The picture that emerges, while still incomplete, points toward a few useful adjustments rather than a single fix. The field appears to be moving toward treating reliability as a first-class evaluation dimension rather than a footnote to capability. Multi-run pass@1, statistical power analysis, and pessimistic bounds like pass^k are entering the conversation precisely because the cost of mistaking noise for progress is now visible. The design assumption that more compute, more memory, or more reasoning effort always helps is being tested empirically and sometimes failing. The gap between "the agent did this once" and "the agent does this when it matters" remains the gap that separates impressive demos from production deployments.

For organizations evaluating agent systems, the implication is straightforward enough to state without overstatement. A single high score on a benchmark suggests what the system can sometimes do. It does not, on its own, describe what the system will do under repetition, perturbation, or duration. The evidence from late 2025 and early 2026 suggests treating these as different questions, and budgeting evaluation accordingly. One open question is whether the next generation of agent improvements will close the split or widen it.

## References

1. A. Garikaparthi et al., "ResearchGym: Evaluating Language Model Agents on Real-World AI Research," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2602.15112
2. X. J. Wang et al., "The Long-Horizon Task Mirage? Diagnosing Where and Why Agentic Systems Break," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2604.11978
3. S. Kapoor et al., "Holistic Agent Leaderboard: The Missing Infrastructure for AI Agent Evaluation," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2510.11977
4. B. Bjarnason et al., "On Randomness in Agentic Evals," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2602.07150
5. A. Khanal et al., "Beyond pass@1: A Reliability Science Framework for Long-Horizon LLM Agents," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2603.29231
6. A. Yehudai et al., "Survey on Evaluation of LLM-based Agents," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2503.16416
7. Z. Wang et al., "Why Reasoning Fails to Plan: A Planning-Centric Analysis of Long-Horizon Decision Making in LLM Agents," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2601.22311
8. K. Zhu et al., "Scaling Test-time Compute for LLM Agents," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2506.12928
9. X. Zeng et al., "ARTIS: Agentic Risk-Aware Test-Time Scaling via Iterative Simulation," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2602.01709
10. J. Kim et al., "Scaling Test-Time Compute for Agentic Coding," *arXiv*, 2026. [Online]. Available: https://arxiv.org/abs/2604.16529

---

# The Edge of the Underdefined

## What stays human when agents learn to engineer their own context

*Mark Williams · Thinkata Research · 2026-04-17*

*This is the final article in "The Meta-Engineer," a three-part series examining how AI is reshaping the identity and skill set of software engineers. The first article, ["Context is the New Code,"](/news/insights/context-is-new-code) traced the rise of context engineering as a discipline. The second, ["Don't Vibe, Architect,"](/news/insights/dont-vibe-architect) showed how professionals orchestrate agents at scale. Both ended with the same uncomfortable observation. The artifacts and skills that feel distinctly human are already beginning to be automated by the systems they were designed to guide.*

This final article takes up the question directly. If self-improving agents can refine their own prompts, playbooks, and architectures, what remains durably human? The answer requires examining two things. First, which engineering skills are being commoditized, and which are gaining value. Second, how far the automation of meta-knowledge, knowledge about how to manage knowledge, has actually progressed. The evidence points toward a conclusion more precise than either "everything will be automated" or "humans will always be needed."
## Which Skills Survive

The analysis of 57 practitioner videos that identified the conductor metaphor in the previous article also raised a pointed concern about what happens at the entry level [1]. Junior engineers who accept AI output without understanding it create "house of cards" solutions, code that compiles and passes tests but rests on foundations no one in the room actually understands. The study argued for curricular shifts toward problem-solving, architectural thinking, code review, and early integration of large language model (LLM) tools, precisely because the skills that agents handle well (syntax, boilerplate, routine implementation) are the same skills that traditionally served as the training ground for new developers. If the on-ramp disappears, the question becomes how to develop judgment without the years of hands-on experience that currently produce it.

A paper framing the emergence of "SE 3.0" documented the broader role shift from manual coding to high-level orchestration and projected that traditional IDEs (integrated development environments, the text editors and tooling that programmers use to write code) will eventually give way to agent orchestration environments [2]. This describes tools and workflows that already exist in prototype form.

### What's Commoditizing

The first direct comparison of agent and human code proficiency found that agents generate overwhelmingly basic-level code, with over 90% of Python constructs falling into beginner and elementary categories [3]. The proficiency profiles of agent-written code and human-written code were broadly similar, with small but statistically significant differences. Agents are not writing qualitatively different code. They are writing structurally similar code faster and cheaper, which makes the commoditization of routine implementation concrete rather than theoretical.

### What's Getting More Expensive

These gains come with real costs. Industry surveys report nearly 89% increases in computing expenses from 2023 to 2025, driven largely by generative AI adoption, with some companies already postponing AI initiatives because the business case collapsed once costs were factored in [4]. Cost-aware engineering, the discipline of managing token budgets (tokens are the units of text that language models process, and each one costs money), model selection, and compute allocation, is emerging as a professional competency that did not exist two years ago. The cheap part is getting cheaper. The expensive part is getting more expensive.
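What cost-aware engineering means in practice can be shown with toy arithmetic. The model names and per-token prices below are placeholders, not real rate cards; the point is the shape of the curve, since an agent loop re-sends its growing context on every turn.

```python
# Toy cost accounting for an agent loop. Model names and per-token prices
# are assumptions for illustration, not actual vendor pricing.
PRICE_PER_MTOK = {  # (input, output) in USD per million tokens, assumed
    "frontier-large": (5.00, 15.00),
    "open-small": (0.20, 0.80),
}

def call_cost(model: str, in_tokens: int, out_tokens: int) -> float:
    p_in, p_out = PRICE_PER_MTOK[model]
    return (in_tokens * p_in + out_tokens * p_out) / 1_000_000

def episode_cost(model: str, turns: int, base_ctx: int = 2_000,
                 out_per_turn: int = 600, tool_tokens: int = 400) -> float:
    # Each turn re-sends the accumulated context, so total input tokens
    # grow roughly quadratically with the number of turns.
    total, ctx = 0.0, base_ctx
    for _ in range(turns):
        total += call_cost(model, ctx, out_per_turn)
        ctx += out_per_turn + tool_tokens
    return total

for model in PRICE_PER_MTOK:
    print(f"{model}: 10 turns ~${episode_cost(model, 10):.3f}, "
          f"50 turns ~${episode_cost(model, 50):.3f}")
```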
An industry-academia consortium of over 30 European partners attempted to map where all of this is heading [5]. Their five-year vision projects "self-star" systems (self-healing, self-optimizing software) enabled by agentic AI across all phases of the software development lifecycle, from requirements gathering through maintenance. The role of the software professional, in this projection, shifts decisively toward oversight, intent specification, and high-level design. The GENIUS project is building tools for this transition, but the transition itself is not waiting for the tools to be ready.

## When Agents Learn to Improve Themselves

The skills gaining value (architectural thinking, constraint specification, quality judgment) all involve what might be called meta-knowledge: knowledge about how to organize, evaluate, and direct other knowledge. The uncomfortable question is whether this meta-level work is itself automatable. A growing body of research suggests that it is, at least partially.

A comprehensive survey of self-evolving AI agents reviewed techniques spanning prompt evolution (automatically refining the instructions given to agents), memory adaptation (optimizing how agents store and retrieve information), tool creation (agents building new capabilities they were not initially given), and architecture search (automatically discovering better organizational structures for multi-agent systems) [6]. The scope is striking. These are not narrow improvements to individual outputs. They are systematic methods for automatically enhancing every major component of an agent system through interaction data and environmental feedback.

### The Compression Pattern

Just as a caterpillar's cocoon becomes unnecessary once the butterfly can fly, layers of engineered scaffolding around an AI agent can become counterproductive when the underlying model grows capable enough. The SICA system (Self-Improving Coding Agent) demonstrated this by autonomously editing its own codebase, improving from 17% to 53% on a subset of SWE-Bench Verified, a benchmark that tests whether agents can resolve real GitHub issues [7]. When a reasoning model was provided as a sub-component, crude reasoning scaffolds that SICA had built for itself actually hurt performance, because the model's native reasoning was better than the agent's self-designed wrapper. This pattern recurs throughout the history of software. A layer that was necessary at one capability level becomes dead weight at the next.

![Three butterfly chrysalises at different stages of metamorphosis, from opaque green to transparent to fully emerged, illustrating how each stage of scaffolding becomes unnecessary as the organism matures](https://images.unsplash.com/photo-1535231540604-72e8fbaf8cdb?w=800&auto=format&fit=crop)

The ACE framework, described in the [first article](/news/insights/context-is-new-code) of this series, treats context as an evolving playbook refined through a generate-reflect-curate cycle [8]. Without any labeled training data, relying solely on execution feedback, ACE matched the top-ranked production-level agent on the AppWorld benchmark, a test suite that evaluates agents on realistic multi-step tasks, despite using a smaller open-source model. The configuration files that feel novel and human-crafted today are already beginning to be optimized by the systems they guide. The MASS framework (Multi-Agent System Search) went further by automating the search over both agent prompts and the topologies connecting multiple agents, treating not just what individual agents do but how they are organized as an optimization target [9]. And the ALAS system (Autonomous Learning Agent System) demonstrated autonomous knowledge acquisition through an iterative loop that generates its own learning curriculum, retrieves information from the web, distills it into training data, fine-tunes the model, evaluates results, and revises its plan without human intervention [10]. This is an agent that expands its own knowledge boundary through self-directed research.
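The generate-reflect-curate cycle is simple enough to sketch end to end. All three stages below are stubs, not ACE's actual generator, reflector, or curator; they only show how a playbook can grow from execution feedback alone, with no labeled data.

```python
# Skeletal generate-reflect-curate loop. In a real system each stage would
# be an LLM call; here they are deterministic stubs for illustration.
playbook = ["Prefer small, verifiable steps."]

def generate(task: str) -> dict:
    # Stub agent run: pretend that tasks involving mocking tend to fail.
    return {"task": task, "ok": "mock" not in task, "log": f"ran: {task}"}

def reflect(result: dict) -> str | None:
    # Draw a lesson from the outcome; None when there is nothing to learn.
    return None if result["ok"] else f"Avoid the pattern seen in '{result['log']}'"

def curate(playbook: list[str], lesson: str | None) -> list[str]:
    # Merge without duplicates and keep the playbook bounded so it stays
    # small enough to fit in the agent's context window.
    if lesson and lesson not in playbook:
        playbook = playbook + [lesson]
    return playbook[-20:]

for task in ["write parser", "mock the database", "add retry logic"]:
    playbook = curate(playbook, reflect(generate(task)))

print("\n".join(playbook))
```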
The evidence is clear enough to state plainly. Prompt optimization, memory management, tool selection, coordination strategy, and even knowledge acquisition, every major dimension of what this series has called "context engineering," is already the subject of automated improvement. The question is not whether these capabilities will be partially automated. They already are.

## The Four Things That Stay

The analysis across this series does not support either comfortable conclusion. Claiming that everything will be automated ignores the specific structural reasons why certain problems resist computational solutions. Claiming that humans will always be needed, as a reassurance, obscures the question of what exactly they will be needed for.

The more precise claim, supported by the evidence across these studies, is that four categories of work resist automation, and they resist it not because they are computationally hard but because they require external grounding that agent systems do not have access to.

**Goal formation.** What should the system do, and why does it matter? Every agent system begins with an objective that a human defined. The choice to build a distributed multiplayer game, to prioritize latency over consistency, to serve a particular user population: these are not optimization problems. They are decisions about what is worth doing, grounded in values, strategy, and institutional context that sits outside any training corpus.

**Constraint legitimacy.** Legal requirements, ethical boundaries, and business constraints come from outside the computational system. An agent can be told to comply with GDPR (the European data protection regulation), but it cannot independently determine that GDPR compliance matters, or negotiate the trade-offs between privacy protection and product functionality. These constraints originate in institutions, not in data.

**Taste and judgment.** The anti-mock instructions that appear in CLAUDE.md files, described in the [first article](/news/insights/context-is-new-code), offer a small but concrete example. Someone had to decide that excessive mocking constitutes bad practice for that particular project. That is a judgment call agents do not make on their own, because "good" is not a property of code. It is a property of the relationship between code and human intentions, and those intentions vary by context in ways that no benchmark captures.

**Accountability.** When systems fail, someone must be responsible. This is not a technical constraint but an institutional one. The question of who is accountable when an autonomous agent introduces a security vulnerability or makes an architectural decision that causes a production outage cannot be resolved computationally. It requires the kind of social contract that only humans can enter into.

These four categories share a common structure. They are not technical problems. They are social, institutional, and epistemic. They persist not because they are difficult to compute, but because the ground truth lives outside the system, in human values, legal frameworks, organizational priorities, and the continuous generation of new ambiguity that the real world produces faster than any system can resolve.
## Where the Edge Moves

Every abstraction layer in the history of software has eventually been formalized and then automated. Assembly gave way to compilers. Manual memory management gave way to garbage collectors. Boilerplate gave way to frameworks. Code generation gave way to autonomous agents. And context engineering, despite feeling like a distinctly human cognitive skill right now, is already being partially automated by the systems it was designed to guide.

The real long-term role of the engineer has less to do with writing code or designing context than with operating at the edge of what machines still cannot define. That edge moves, and it moves fast. But it does not disappear, because the world keeps generating new ambiguity faster than systems can resolve it. The engineer of 2030 probably will not be writing CLAUDE.md files by hand. That engineer will be defining intent, negotiating constraints, and reviewing outcomes, the same things that were always the hardest part of engineering, dressed in new tools.

The pattern across this series suggests that humans do not simply move up the stack. They move to wherever meaning is still underdefined.

## References

1. H.-F. Chang et al., "Coding With AI: From a Reflection on Industrial Practices to Future Computer Science and Software Engineering Education," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2512.23982
2. H. Li et al., "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2507.15003
3. N. Temkulkiat et al., "When is Generated Code Difficult to Comprehend? Assessing AI Agent Python Code Proficiency in the Wild," in *Proc. 23rd International Conference on Mining Software Repositories (MSR '26)*, 2026. [Online]. Available: https://arxiv.org/abs/2604.00299
4. V. Acharya, "Generative AI and the Transformation of Software Development Practices," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2510.10819
5. R. Gröpler et al., "The Future of Generative AI in Software Engineering: A Vision from Industry and Academia in the European GENIUS Project," in *Proc. 2nd ACM International Conference on AI-powered Software (AIware '25)*, 2025. [Online]. Available: https://arxiv.org/abs/2511.01348
6. J. Fang et al., "A Comprehensive Survey of Self-Evolving AI Agents: A New Paradigm Bridging Foundation Models and Lifelong Agentic Systems," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2508.07407
7. M. Robeyns et al., "A Self-Improving Coding Agent," in *ICLR 2025 Workshop on Scaling Self-Improving Foundation Models*, 2025. [Online]. Available: https://arxiv.org/abs/2504.15228
8. Q. Zhang et al., "Agentic Context Engineering: Evolving Contexts for Self-Improving Language Models," in *Proc. International Conference on Learning Representations (ICLR)*, 2026. [Online]. Available: https://arxiv.org/abs/2510.04618
9. H. Zhou et al., "Multi-Agent Design: Optimizing Agents with Better Prompts and Topologies," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2502.02533
10. D. Atreja, "ALAS: Autonomous Learning Agent for Self-Updating Language Models," *arXiv*, 2025. [Online]. Available: https://arxiv.org/abs/2508.15805
Atreja, \"ALAS: Autonomous Learning Agent for Self-Updating Language Models,\" ",[478,934,480],{},[291,936,489],{"href":937,"target":485,"className":938},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2508.15805",[487,488],{"title":582,"searchDepth":583,"depth":583,"links":940},[941,942,943,944,945,946],{"id":636,"depth":583,"text":637},{"id":656,"depth":583,"text":657},{"id":718,"depth":583,"text":719},{"id":785,"depth":583,"text":786},{"id":826,"depth":583,"text":827},{"id":459,"depth":583,"text":463},"2026-04-17","What stays human when agents learn to engineer their own context, and why the answer has less to do with abstraction level than with the nature of the work itself.",{"src":950},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1451187580459-43490279c0fa?w=800&auto=format&fit=crop",{"authors":952,"badge":955,"source":956},[953],{"avatar":954,"name":611,"to":612},{"src":610},{"label":614},{"name":616,"url":612},{"title":154,"description":948},"FkFpLPfa7K2eq0xbtFoz39XpXJZ1v0OQJ0a3JnOR-zA",1777212503314]