[{"data":1,"prerenderedAt":855},["ShallowReactive",2],{"navigation":3,"\u002Fnews\u002Finsights\u002Fexperts-all-the-way":281,"\u002Fnews\u002Finsights\u002Fexperts-all-the-way-surround":619},[4,8,17,21,25,29,33,37,269,273,277],{"title":5,"path":6,"stem":7},"About Thinkata Intelligence","\u002Fabout","about",{"title":9,"path":10,"stem":11,"children":12},"Authentication","\u002Fauth","auth",[13],{"title":14,"path":15,"stem":16},"Email Confirmation","\u002Fauth\u002Fconfirmation","auth\u002Fconfirmation",{"title":18,"path":19,"stem":20},"Case Studies","\u002Fcase-studies","case-studies",{"title":22,"path":23,"stem":24},"Contact Us","\u002Fcontact","contact",{"title":26,"path":27,"stem":28},"Thinkata - Advanced AI Engineering & Multi-Agent System Solutions","\u002F","index",{"title":30,"path":31,"stem":32},"Insights","\u002Finsights","insights",{"title":34,"path":35,"stem":36},"Leadership","\u002Fleadership","leadership",{"title":38,"path":39,"stem":40,"children":41},"News","\u002Fnews","news",[42,45,69],{"title":43,"path":39,"stem":44},"News & Insights","news\u002Findex",{"title":18,"path":46,"stem":47,"children":48},"\u002Fnews\u002Fcase-studies","news\u002Fcase-studies",[49,53,57,61,65],{"title":50,"path":51,"stem":52},"Building Secure and Scalable AI Infrastructure: Integrating with Existing Systems through Modern Cloud Frameworks","\u002Fnews\u002Fcase-studies\u002Fcloud-infrastructure-ai","news\u002Fcase-studies\u002Fcloud-infrastructure-ai",{"title":54,"path":55,"stem":56},"Making Sense of Financial Regulations: How AI Teams Can Tackle Complex Documents","\u002Fnews\u002Fcase-studies\u002Ffinancial-regulations","news\u002Fcase-studies\u002Ffinancial-regulations",{"title":58,"path":59,"stem":60},"AI-Powered Transformations in Healthcare","\u002Fnews\u002Fcase-studies\u002Fhealth-care","news\u002Fcase-studies\u002Fhealth-care",{"title":62,"path":63,"stem":64},"Generative AI in Upstream Natural Gas: Shell's Exploration Initiative","\u002Fnews\u002Fcase-studies\u002Foil-gas","news\u002Fcase-studies\u002Foil-gas",{"title":66,"path":67,"stem":68},"Optimizing Manufacturing with AI-Driven Multi-Agent Systems","\u002Fnews\u002Fcase-studies\u002Fsupply-chain-optimization","news\u002Fcase-studies\u002Fsupply-chain-optimization",{"title":30,"path":70,"stem":71,"children":72},"\u002Fnews\u002Finsights","news\u002Finsights",[73,77,81,85,89,93,97,101,105,109,113,117,121,125,129,133,137,141,145,149,153,157,161,165,169,173,177,181,185,189,193,197,201,205,209,213,217,221,225,229,233,237,241,245,249,253,257,261,265],{"title":74,"path":75,"stem":76},"The Capability-Reliability Split in Agent Systems","\u002Fnews\u002Finsights\u002Fagent-capability-reliability-split","news\u002Finsights\u002Fagent-capability-reliability-split",{"title":78,"path":79,"stem":80},"The Rise of AI Agents in Cyberattacks: Latest Research and Threats","\u002Fnews\u002Finsights\u002Fai-agent-cyber-threats","news\u002Finsights\u002Fai-agent-cyber-threats",{"title":82,"path":83,"stem":84},"The Smart Enterprise AI Stack: Why Teams of AI Agents Beat Solo Models Consistently","\u002Fnews\u002Finsights\u002Fai-architecture","news\u002Finsights\u002Fai-architecture",{"title":86,"path":87,"stem":88},"When Seeing Everything Becomes the Only Option","\u002Fnews\u002Finsights\u002Fai-comprehensive-observability","news\u002Finsights\u002Fai-comprehensive-observability",{"title":90,"path":91,"stem":92},"The Data Infrastructure AI-Native Systems Can't Ignore","\u002Fnews\u002Finsights\u002Fai-data-layer","news\u002Finsights\u002Fai-data-layer",{"title":94,"path":95,"stem":96},"Enterprise AI Triage Systems: Intelligent Automation for Large-Scale Operations","\u002Fnews\u002Finsights\u002Fai-enterprise-triage","news\u002Finsights\u002Fai-enterprise-triage",{"title":98,"path":99,"stem":100},"When Oversight Becomes Infrastructure","\u002Fnews\u002Finsights\u002Fai-governed-autonomy","news\u002Finsights\u002Fai-governed-autonomy",{"title":102,"path":103,"stem":104},"Designing for Graceful Failure in Compound AI Systems","\u002Fnews\u002Finsights\u002Fai-graceful-failure","news\u002Finsights\u002Fai-graceful-failure",{"title":106,"path":107,"stem":108},"Intelligent Composability: Building AI Systems Like Orchestra, Not Soloists","\u002Fnews\u002Finsights\u002Fai-intelligent-composability","news\u002Finsights\u002Fai-intelligent-composability",{"title":110,"path":111,"stem":112},"Building the Plane While Flying It — Migrating from Monolith to AI-Native Without Stopping","\u002Fnews\u002Finsights\u002Fai-migration-path","news\u002Finsights\u002Fai-migration-path",{"title":114,"path":115,"stem":116},"Stability Through Continuous Adaptation","\u002Fnews\u002Finsights\u002Fai-native-overview","news\u002Finsights\u002Fai-native-overview",{"title":118,"path":119,"stem":120},"Provable Stability: Mathematical Guarantees for Adaptive AI Systems","\u002Fnews\u002Finsights\u002Fai-provable-stability","news\u002Finsights\u002Fai-provable-stability",{"title":122,"path":123,"stem":124},"How Temperature Tuning Makes or Breaks Reinforcement Learning","\u002Fnews\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse","news\u002Finsights\u002Fai-soft-actor-critic-entropy-collapse",{"title":126,"path":127,"stem":128},"Testing What Can't Be Predicted","\u002Fnews\u002Finsights\u002Fai-systems-testing","news\u002Finsights\u002Fai-systems-testing",{"title":130,"path":131,"stem":132},"Closing the Loop: How Human Corrections Can Make AI Systems Smarter Over Time","\u002Fnews\u002Finsights\u002Fclosing-the-loop","news\u002Finsights\u002Fclosing-the-loop",{"title":134,"path":135,"stem":136},"Multi-Path Reasoning: Collaborative and Competitive Approaches in AI","\u002Fnews\u002Finsights\u002Fcollaborative-competitive-agents","news\u002Finsights\u002Fcollaborative-competitive-agents",{"title":138,"path":139,"stem":140},"Why Challenges Supercharge Smarts for Humans and AI","\u002Fnews\u002Finsights\u002Fcompetition-improves-ai","news\u002Finsights\u002Fcompetition-improves-ai",{"title":142,"path":143,"stem":144},"Context is Infrastructure, Not Instructions","\u002Fnews\u002Finsights\u002Fcontext-is-infrastructure","news\u002Finsights\u002Fcontext-is-infrastructure",{"title":146,"path":147,"stem":148},"Context is the New Code","\u002Fnews\u002Finsights\u002Fcontext-is-new-code","news\u002Finsights\u002Fcontext-is-new-code",{"title":150,"path":151,"stem":152},"Continuous Thought Machines","\u002Fnews\u002Finsights\u002Fcontinuous-thought-machines","news\u002Finsights\u002Fcontinuous-thought-machines",{"title":154,"path":155,"stem":156},"Don't Vibe, Architect","\u002Fnews\u002Finsights\u002Fdont-vibe-architect","news\u002Finsights\u002Fdont-vibe-architect",{"title":158,"path":159,"stem":160},"The Edge of the Underdefined","\u002Fnews\u002Finsights\u002Fedge-of-the-underdefined","news\u002Finsights\u002Fedge-of-the-underdefined",{"title":162,"path":163,"stem":164},"Experts All the Way Down","\u002Fnews\u002Finsights\u002Fexperts-all-the-way","news\u002Finsights\u002Fexperts-all-the-way",{"title":166,"path":167,"stem":168},"A Multi-Tier Safety Architecture for Critical Applications","\u002Fnews\u002Finsights\u002Ffour-tier-architecture","news\u002Finsights\u002Ffour-tier-architecture",{"title":170,"path":171,"stem":172},"Green Dashboard, Unhappy Users","\u002Fnews\u002Finsights\u002Fgreen-dashboard-unhappy-users","news\u002Finsights\u002Fgreen-dashboard-unhappy-users",{"title":174,"path":175,"stem":176},"Hybrid Autoregressive Residual Tokens","\u002Fnews\u002Finsights\u002Fhart-model","news\u002Finsights\u002Fhart-model",{"title":178,"path":179,"stem":180},"Hierarchical Reasoning in Artificial Intelligence","\u002Fnews\u002Finsights\u002Fhierarchical-approaches","news\u002Finsights\u002Fhierarchical-approaches",{"title":182,"path":183,"stem":184},"Latent Diffusion for Language Generation: A Comprehensive Overview","\u002Fnews\u002Finsights\u002Flatent-diffusion-for-language","news\u002Finsights\u002Flatent-diffusion-for-language",{"title":186,"path":187,"stem":188},"Breaking Language Barriers: How AI Can Translate Without Examples","\u002Fnews\u002Finsights\u002Flearning-languages","news\u002Finsights\u002Flearning-languages",{"title":190,"path":191,"stem":192},"The Emergence of AI Deception: How Large Language Models Have Learned to Strategically Mislead Users","\u002Fnews\u002Finsights\u002Fllm-deception","news\u002Finsights\u002Fllm-deception",{"title":194,"path":195,"stem":196},"Grading on a Shared Curve","\u002Fnews\u002Finsights\u002Fllm-judge-correlated-errors","news\u002Finsights\u002Fllm-judge-correlated-errors",{"title":198,"path":199,"stem":200},"Synergizing Specialized Reasoning and General Capabilities in AI","\u002Fnews\u002Finsights\u002Fllm-reasoning-advances","news\u002Finsights\u002Fllm-reasoning-advances",{"title":202,"path":203,"stem":204},"The AI That Rewrites Itself: MIT's Breakthrough in Self-Adapting Language Models","\u002Fnews\u002Finsights\u002Fllm-seal","news\u002Finsights\u002Fllm-seal",{"title":206,"path":207,"stem":208},"Metacognitive Reinforcement Learning for Self-Improving AI Systems","\u002Fnews\u002Finsights\u002Fmetacognitive-reinforcement-learning","news\u002Finsights\u002Fmetacognitive-reinforcement-learning",{"title":210,"path":211,"stem":212},"Revolutionary Advancements in Mixture of Experts (MoE) Architectures","\u002Fnews\u002Finsights\u002Fmixture-of-experts","news\u002Finsights\u002Fmixture-of-experts",{"title":214,"path":215,"stem":216},"Balancing Neural Plasticity and Stability","\u002Fnews\u002Finsights\u002Fneural-plasticity","news\u002Finsights\u002Fneural-plasticity",{"title":218,"path":219,"stem":220},"Offline RL and the Data Flywheel","\u002Fnews\u002Finsights\u002Foffline-rl-data-flywheel","news\u002Finsights\u002Foffline-rl-data-flywheel",{"title":222,"path":223,"stem":224},"When Optimization Optimizes Itself","\u002Fnews\u002Finsights\u002Frecursive-goodhart","news\u002Finsights\u002Frecursive-goodhart",{"title":226,"path":227,"stem":228},"Reward Design as Architecture","\u002Fnews\u002Finsights\u002Freward-design-as-architecture","news\u002Finsights\u002Freward-design-as-architecture",{"title":230,"path":231,"stem":232},"When Success Has No Author: The Temporal Credit Assignment Problem","\u002Fnews\u002Finsights\u002Frl-credit-assignment-problem","news\u002Finsights\u002Frl-credit-assignment-problem",{"title":234,"path":235,"stem":236},"Beyond Entropy Collapse: When Exploration Succeeds but Learning Fails","\u002Fnews\u002Finsights\u002Frl-optimization-gaps","news\u002Finsights\u002Frl-optimization-gaps",{"title":238,"path":239,"stem":240},"The Path to Practical Confidential Computing for AI Systems","\u002Fnews\u002Finsights\u002Fsecure-ai-architectures","news\u002Finsights\u002Fsecure-ai-architectures",{"title":242,"path":243,"stem":244},"Guess First, Check Later","\u002Fnews\u002Finsights\u002Fspeculative-execution-pattern","news\u002Finsights\u002Fspeculative-execution-pattern",{"title":246,"path":247,"stem":248},"Spiking Neural Networks for Energy-Efficient AI","\u002Fnews\u002Finsights\u002Fspiking-neural-networks","news\u002Finsights\u002Fspiking-neural-networks",{"title":250,"path":251,"stem":252},"The Turn as the Unit of Quality","\u002Fnews\u002Finsights\u002Fstructured-iteration-quality","news\u002Finsights\u002Fstructured-iteration-quality",{"title":254,"path":255,"stem":256},"AI Speech Translation: Breaking Down Language Barriers","\u002Fnews\u002Finsights\u002Fsts-performance-advances","news\u002Finsights\u002Fsts-performance-advances",{"title":258,"path":259,"stem":260},"Test-Time Training Layers: The Next Evolution in Transformer Architecture","\u002Fnews\u002Finsights\u002Ftest-time-training-layers","news\u002Finsights\u002Ftest-time-training-layers",{"title":262,"path":263,"stem":264},"Breakthrough: Large Language Models Pass the Turing Test","\u002Fnews\u002Finsights\u002Fturing-tests","news\u002Finsights\u002Fturing-tests",{"title":266,"path":267,"stem":268},"Training in a World That Does Not Exist Yet","\u002Fnews\u002Finsights\u002Fworld-models-as-infrastructure","news\u002Finsights\u002Fworld-models-as-infrastructure",{"title":270,"path":271,"stem":272},"Privacy Policy","\u002Fprivacy","privacy",{"title":274,"path":275,"stem":276},"Research","\u002Fresearch","research",{"title":278,"path":279,"stem":280},"Terms of Service","\u002Fterms","terms",{"id":282,"title":162,"body":283,"date":601,"description":602,"extension":603,"image":604,"meta":605,"navigation":616,"path":163,"seo":617,"stem":164,"__hash__":618},"insights\u002Fnews\u002Finsights\u002Fexperts-all-the-way.md",{"type":284,"value":285,"toc":590},"minimark",[286,305,315,324,327,331,337,348,356,360,370,392,396,413,417,427,437,454,458,461],[287,288,291,292,291,298],"div",{"className":289},[290],"page-title","\n  ",[293,294,162],"h1",{"className":295,"id":297},[296],"page-title__main","experts-all-the-way-down",[299,300,304],"h2",{"className":301,"id":303},[302],"page-title__sub","recursive-hierarchical-gating-and-the-gap-between-a-1994-idea-and-todays-composed-models","Recursive Hierarchical Gating and the Gap Between a 1994 Idea and Today's Composed Models",[287,306,308,309],{"style":307},"width: 100%; padding: 2%;","\n    ",[310,311],"img",{"src":312,"alt":313,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1766364649443-9e6f73e71403?w=1200&auto=format&fit=crop","Bare tree branches dividing into ever finer twigs against the sky, analogous to a tree of gates that splits the input again at each level, with an expert waiting only at the final tip","width: 100%; height: auto;",[316,317,318,319,323],"p",{},"A mixture-of-experts model holds many specialized sub-networks, called experts, and a small gating network that decides which of them handle a given input. In the usual design the gate fires once. It looks at a token, picks a few experts, and the rest of the layer sits out, a setup covered in an ",[320,321,322],"a",{"href":211},"earlier overview of mixture-of-experts",". The idea worth examining keeps the gate but changes what happens after it decides. Rather than pick an expert and stop, the gate splits the incoming context into categories and forwards each piece to a more specialized system, and that system may carry its own gate, which splits again, to whatever depth helps. The decision becomes a tree rather than a single fork, and only at the leaves does an expert actually answer.",[316,325,326],{},"The structure branches the way a tree divides a trunk into limbs and limbs into twigs, each split finer than the last, with an expert waiting only at the final tip. Two features set it apart from a standard routing layer. The split is recursive rather than a single step, and the thing being routed is a chunk of meaning, a document or a sub-task, rather than an individual token. Whether that structure earns its added complexity, measured against a single flat model or a single flat routing step, is an open question with a longer history behind it than the recent interest suggests.",[299,328,330],{"id":329},"a-soft-decision-tree-circa-1994","A Soft Decision Tree, Circa 1994",[287,332,308,333],{"style":307},[310,334],{"src":335,"alt":336,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1601661783552-b125ad86e67d?w=1200&auto=format&fit=crop","Blue ink dispersing through water with no hard edge, analogous to a gating network whose region boundaries are soft, so an input can belong partly to several branches at once rather than landing in exactly one",[316,338,339,340,347],{},"The recursive version is not new. Jordan and Jacobs described it in 1994 as a tree-structured architecture in which gating networks sit at the branch points and experts sit at the leaves ",[341,342,343],"sup",{},[320,344,346],{"href":345},"#source-1","[1]",". Each gate partitions the input space, except the boundaries are soft, the way ink dropped into water spreads into a shared zone instead of stopping at a clean line, so an input can belong partly to several branches at once rather than landing in exactly one. The same construction repeats at every level, which yields a tree of arbitrary depth. They named the result a hierarchical mixture of experts and fit the whole thing with the expectation-maximization algorithm, a standard method for learning models that contain hidden structure. Their own phrase remains the clearest handle on the idea, a soft decision tree, where each node asks a fuzzy question and the answers blend instead of committing to one path.",[316,349,350,351,355],{},"That 1994 model routed fixed-length numeric inputs through simple linear experts, a long way from variable-length natural language. The worked examples ran two levels deep, though the authors noted the method extends to arbitrary depth ",[341,352,353],{},[320,354,346],{"href":345},". Whether the soft-partition mathematics survives the move to language, where the input is a long sequence with no fixed dimension and the categories are semantic rather than geometric, is the part that does not carry over for free.",[299,357,359],{"id":358},"expert-layers-versus-expert-models","Expert Layers Versus Expert Models",[316,361,362,363,369],{},"A distinction matters before going further. In a transformer mixture-of-experts the experts are feed-forward blocks inside one network, trained together, sharing a backbone, and routing picks among parts of a single model, which is the common case a recent survey of the area documents ",[341,364,365],{},[320,366,368],{"href":367},"#source-2","[2]",". A different design treats each expert as a whole, separately trained, separately deployable model and puts a router in front of the collection. The literature calls this composition of experts, a model of models rather than a layer of them.",[316,371,372,373,379,380,384,385,391],{},"One such system uses a single router over a pool of expert language models and reaches the quality of a much larger model while keeping average active parameters low, around 31 billion on one benchmark ",[341,374,375],{},[320,376,378],{"href":377},"#source-3","[3]",". Its routing runs in two steps, a category router first sorts the prompt into one of a fixed set of categories, then a lookup maps that category to the best expert ",[341,381,382],{},[320,383,378],{"href":377},". That is close to the category-splitting idea, yet the routing still resolves to a single expert in one pass. A second design encodes each expert model as a special token in a controller model's vocabulary, so choosing an expert looks like generating the next token, and it reports a few percent gain over earlier multi-model methods ",[341,386,387],{},[320,388,390],{"href":389},"#source-4","[4]",". Useful as these are, the router in each is a flat dispatch to one of several models. The recursive part, a chosen branch that is itself another gated system, is absent.",[299,393,395],{"id":394},"hierarchical-recursive-and-the-difference","Hierarchical, Recursive, and the Difference",[316,397,398,399,405,406,412],{},"Two recent lines of work carry the words hierarchical and recursive, and both deserve pinning down, because neither is the recursive category dispatch sketched above. One groups a model's experts and applies routing control at two coupled levels, balancing traffic across groups while encouraging specialization within them, and reports a modest perplexity gain and much better expert balance at the seven-billion scale ",[341,400,401],{},[320,402,404],{"href":403},"#source-5","[5]",". That is hierarchy inside one model's router, still operating on tokens. The other reuses a single shared stack of layers several times and lets a lightweight router decide how many passes each token takes, which saves parameters and compute ",[341,407,408],{},[320,409,411],{"href":410},"#source-6","[6]",". Its recursion is over depth of computation, how often a token revisits the same block, not over which specialized model handles which category of content. Recursive computation and recursive routing are easy to conflate and are not the same idea.",[299,414,416],{"id":415},"whether-the-extra-structure-pays","Whether the Extra Structure Pays",[316,418,419,420,426],{},"A prior question hides under all of this. If routing by meaning is valuable, do large models already do it on their own? A 2025 study probed several open mixture-of-experts models and found clear, statistically significant evidence that routing is sensitive to semantics, with expert overlap rising when meaning is preserved and falling when it changes, an effect strongest in the middle layers and growing with model size ",[341,421,422],{},[320,423,425],{"href":424},"#source-7","[7]",". The behavior looks learned and emergent rather than designed in. If category-like specialization arises on its own during ordinary training, a hand-built category gate has to justify itself against a baseline that already routes semantically without being asked to.",[316,428,429,430,436],{},"The statistical theory is encouraging but conditional. A recent analysis of hierarchical mixtures shows that the choice of gating function changes the outcome, that the familiar softmax gate creates parameter interactions which slow expert convergence, and that a different gating function removes them and sharpens specialization ",[341,431,432],{},[320,433,435],{"href":434},"#source-8","[8]",". A hierarchical gate can provably help, in other words, but only under the right design, and the wrong gate blunts the advantage the structure was meant to provide.",[316,438,439,440,446,447,453],{},"The simple version of the pattern already ships, though rarely more than two levels deep. A production audio assistant routes a query with a lightweight intent classifier to one of several specialized models, speech recognition, speaker identification, music tagging, then lets a small language model assemble the answer, and the cheap classifier beats a large model at the routing step ",[341,441,442],{},[320,443,445],{"href":444},"#source-9","[9]",". Nesting these systems further, a classifier whose chosen branch is itself a classifier-plus-experts system, is uncommon in published work. The closest the agent-orchestration literature comes is a hierarchical scheme that decomposes a task with a planning agent, instantiates specialized worker agents per sub-task, and searches over their arrangements, with double-digit accuracy gains reported on reasoning benchmarks ",[341,448,449],{},[320,450,452],{"href":451},"#source-10","[10]",". The decomposition there is genuinely multi-level, though it splits tasks rather than routing categories of context to standing expert models.",[299,455,457],{"id":456},"what-this-suggests","What This Suggests",[316,459,460],{},"The thirty-year arc is tidy in outline and unfinished in substance. The recursive, tree-structured gate was written down in 1994, and the modern pieces exist in scattered form, composition across whole models, hierarchical control inside one model, recursion over compute, emergent semantic routing, and conditional theory about when gating helps. What is missing is the join, a system that splits context by category and dispatches recursively across separately specialized models, more than two levels deep, with evidence that it beats a flat router. A recursive gate that dispatches to separate models would also inherit hard questions about where each leaf actually runs, since every leaf is a model that has to be served somewhere. The appeal of the idea is its plainness, a small classifier asking a question and asking it again. Whether that plainness holds once the splits are semantic, the experts are full models, and the tree is more than shallow, is still closer to a promising hypothesis than a settled result.",[287,462,291,466,291,469],{"className":463},[464,465],"references","mt-8",[299,467,468],{"id":464},"References",[470,471,308,477,308,495,308,507,308,517,308,527,308,538,308,549,308,560,308,570,308,580,291],"ol",{"className":472},[473,474,475,476],"list-decimal","list-inside","space-y-2","mt-4",[478,479,481,482,486,487],"li",{"id":480},"source-1","M. I. Jordan and R. A. Jacobs, \"Hierarchical Mixtures of Experts and the EM Algorithm,\" ",[483,484,485],"em",{},"Neural Computation",", vol. 6, no. 2, pp. 181–214, 1994. DOI: ",[320,488,494],{"href":489,"target":490,"className":491},"https:\u002F\u002Fdoi.org\u002F10.1162\u002Fneco.1994.6.2.181","_blank",[492,493],"text-blue-600","underline","[Online]",[478,496,498,499,502,503],{"id":497},"source-2","W. Cai et al., \"A Survey on Mixture of Experts in Large Language Models,\" ",[483,500,501],{},"arXiv",", 2024, ",[320,504,494],{"href":505,"target":490,"className":506},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2407.06204",[492,493],[478,508,510,511,502,513],{"id":509},"source-3","S. Jain et al., \"Composition of Experts: A Modular Compound AI System Leveraging Large Language Models,\" ",[483,512,501],{},[320,514,494],{"href":515,"target":490,"className":516},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.01868",[492,493],[478,518,520,521,502,523],{"id":519},"source-4","Z. Chai et al., \"An Expert is Worth One Token: Synergizing Multiple Expert LLMs as Generalist via Expert Token Routing,\" ",[483,522,501],{},[320,524,494],{"href":525,"target":490,"className":526},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2403.16854",[492,493],[478,528,530,531,533,534],{"id":529},"source-5","G. Molodtsov et al., \"Hierarchical Mixture-of-Experts with Two-Stage Optimization,\" ",[483,532,501],{},", 2026, ",[320,535,494],{"href":536,"target":490,"className":537},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2605.08292",[492,493],[478,539,541,542,544,545],{"id":540},"source-6","S. Bae et al., \"Mixture-of-Recursions: Learning Dynamic Recursive Depths for Adaptive Token-Level Computation,\" ",[483,543,501],{},", 2025, ",[320,546,494],{"href":547,"target":490,"className":548},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2507.10524",[492,493],[478,550,552,553,544,556],{"id":551},"source-7","M. L. Olson et al., \"Probing Semantic Routing in Large Mixture-of-Expert Models,\" in ",[483,554,555],{},"Findings of the Association for Computational Linguistics (EMNLP)",[320,557,494],{"href":558,"target":490,"className":559},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.10928",[492,493],[478,561,563,564,502,566],{"id":562},"source-8","H. Nguyen et al., \"On Expert Estimation in Hierarchical Mixture of Experts: Beyond Softmax Gating Functions,\" ",[483,565,501],{},[320,567,494],{"href":568,"target":490,"className":569},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2410.02935",[492,493],[478,571,573,574,502,576],{"id":572},"source-9","V. Naveen et al., \"Comprehensive Audio Query Handling System with Integrated Expert Models and Contextual Understanding,\" ",[483,575,501],{},[320,577,494],{"href":578,"target":490,"className":579},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2412.03980",[492,493],[478,581,583,584,544,586],{"id":582},"source-10","Z. Hou et al., \"HALO: Hierarchical Autonomous Logic-Oriented Orchestration for Multi-Agent LLM Systems,\" ",[483,585,501],{},[320,587,494],{"href":588,"target":490,"className":589},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2505.13516",[492,493],{"title":591,"searchDepth":592,"depth":592,"links":593},"",2,[594,595,596,597,598,599,600],{"id":303,"depth":592,"text":304},{"id":329,"depth":592,"text":330},{"id":358,"depth":592,"text":359},{"id":394,"depth":592,"text":395},{"id":415,"depth":592,"text":416},{"id":456,"depth":592,"text":457},{"id":464,"depth":592,"text":468},"2026-06-20","A small gate that does not just pick an expert but splits the input and asks again, recursively, before any expert answers, is an old idea with a 1994 pedigree. Today's composition-of-experts and hierarchical mixture-of-experts systems borrow pieces of it, though most still dispatch in a single flat step, which leaves the genuinely recursive, category-by-category version more proposed than proven.","md",{"src":312},{"authors":606,"badge":612,"source":614},[607],{"avatar":608,"name":610,"to":611},{"src":609},"\u002Fimg\u002Fmark_avatar.png","Mark Williams","https:\u002F\u002Fthinkata.com",{"label":613},"AI Architecture",{"name":615,"url":611},"Thinkata Research",true,{"title":162,"description":602},"rmJ0f3bH_I1RhEuR8713XDMklMJrd2942B33pzkapEo",[620,621],null,{"id":622,"title":170,"body":623,"date":843,"description":844,"extension":603,"image":845,"meta":846,"navigation":616,"path":171,"seo":853,"stem":172,"__hash__":854,"_path":171},"insights\u002Fnews\u002Finsights\u002Fgreen-dashboard-unhappy-users.md",{"type":284,"value":624,"toc":833},[625,637,643,652,655,659,665,673,676,680,688,691,695,703,711,715,721,729,732,736,763,767,770],[287,626,291,628,291,632],{"className":627},[290],[293,629,170],{"className":630,"id":631},[296],"green-dashboard-unhappy-users",[299,633,636],{"className":634,"id":635},[302],"why-passing-every-eval-doesnt-mean-your-ai-works","Why Passing Every Eval Doesn't Mean Your AI Works",[287,638,308,639],{"style":307},[310,640],{"src":641,"alt":642,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1762163516269-3c143e04175c?w=1200&auto=format&fit=crop","A rack of status lights all glowing green from edge to edge, analogous to a monitoring dashboard where every indicator reads healthy while the question that matters, whether the people on the other end are well served, is not one any of these lights measures",[316,644,645,646,648,649,651],{},"A team ships a model update. The evaluation suite runs, every check comes back green, the automated quality score clears its threshold. A week later the support queue is filling and the usage curve is bending the wrong way. The usual explanations for this gap are by now well documented, and most of them blame the test. The grader might share the model's blind spots, the problem traced in ",[320,647,194],{"href":195},". The single score might hide how much the system wobbles from one run to the next, the subject of ",[320,650,74],{"href":75},". Both are stories about a flawed instrument.",[316,653,654],{},"Set them aside for a moment. Suppose the eval is clean, stable, free of contamination, and scored honestly. A green dashboard can still sit above unhappy users for a reason that has nothing to do with the test being broken. The dashboard measures the model, by itself, on a fixed set of inputs, at one point in time. The user lives with the whole system, in back-and-forth, over weeks. The model, the human-AI team, and the running experience are three different things, and an offline eval only ever sees the first. Three fields that have been living this exact gap for years, recommender systems, human-computer interaction, and online experimentation, have both the evidence and the vocabulary worth borrowing.",[299,656,658],{"id":657},"the-soloist-and-the-duet","The Soloist and the Duet",[287,660,308,661],{"style":307},[310,662],{"src":663,"alt":664,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1516016906593-866c0d0356af?w=1200&auto=format&fit=crop","A man and a woman playing string instruments together as a duet, analogous to a human-AI team, where what matters is how the pair sounds together rather than how well either musician would score playing alone",[316,666,667,668,672],{},"A clean recording of a soloist says very little about how the duet will sound. The thing a user experiences is the duet, the model and the person working through a task together, and that joint performance is what an eval of the model alone leaves out. Bansal and colleagues studied this directly with an AI whose accuracy was comparable to the humans it assisted, asking whether the pair could reach complementary performance, where the team is more accurate than either the person or the model working solo ",[341,669,670],{},[320,671,346],{"href":345},". Adding the AI helped. Adding explanations of the AI's reasoning, the feature most often assumed to help further, did not increase complementarity. Instead, explanations raised the chance that people accepted the AI's recommendation regardless of whether it was correct.",[316,674,675],{},"That last detail is the uncomfortable one. A model can post a higher solo score and leave the team no better off, and a change meant to improve the experience can make the team worse by encouraging misplaced trust. The dashboard grades the soloist. The user hears the duet, and the duet has its own failure modes that the solo score cannot register.",[299,677,679],{"id":678},"offline-wins-do-not-always-survive-contact","Offline Wins Do Not Always Survive Contact",[316,681,682,683,687],{},"Recommender systems have run this comparison for real, with users instead of held-out data. Rossetti, Stella, and Zanker used a within-users design, the same people in both settings, to compare how algorithms ranked by offline accuracy against how they ranked in an online, user-centric study ",[341,684,685],{},[320,686,368],{"href":367},". The two rankings contradicted each other. The algorithm that looked best on the offline metric was significantly worse online at producing recommendations users actually found useful, meaning both relevant and novel. The authors put it plainly, the external validity of the most common offline evaluation method is not guaranteed.",[316,689,690],{},"That is roughly a decade of green dashboards over unhappy users, documented in a field that depends on getting recommendations right. The practical reading is modest and useful. An offline eval is a screen, a cheap way to decide which candidates deserve a more expensive test. It is not the verdict. Treating the offline ranking as the answer is how a system clears every check and still disappoints the people it was built for.",[299,692,694],{"id":693},"the-proxy-you-reach-for-online-bites-back","The Proxy You Reach For Online Bites Back",[316,696,697,698,702],{},"The obvious response is to stop measuring proxies and measure users. The catch is that online measurement is also a proxy. What gets logged is behavior, clicks, session length, a thumbs up, whether someone came back, and behavior is not the same as value. Kleinberg, Mullainathan, and Raghavan modeled what happens when a well-meaning platform optimizes engagement while users hold inconsistent preferences, wanting one thing in the moment and another on reflection ",[341,699,700],{},[320,701,378],{"href":377},". Engagement and genuine utility can pull apart. Their image for it is chips and salad. For some content, the fact that people consume more of it tracks real value, the way the most-watched calculus tutorial may simply be the best one. For other content, consuming more signals a pull people would not endorse on reflection, the way a bag of chips disappears. Users can spend long sessions and get little from them, and a change can lift engagement for a while before they quit abruptly.",[316,704,705,706,710],{},"Time makes this harder to read. Drawing from online experiments, Kohavi and colleagues catalog why short experiments mislead, including novelty and primacy effects, where a change looks like a win for the first couple of weeks mainly because it is new, then fades once the novelty wears off ",[341,707,708],{},[320,709,390],{"href":389},". Their central advice is to choose an Overall Evaluation Criterion, a metric deliberately tied to long-term value rather than a short-term bump, and to run experiments long enough to see past the early effect. A green two-week A\u002FB test and a satisfied user a quarter later are different claims, and only one of them is on the dashboard.",[299,712,714],{"id":713},"some-of-works-was-never-going-into-a-test","Some of \"Works\" Was Never Going Into a Test",[287,716,308,717],{"style":307},[310,718],{"src":719,"alt":720,"style":314},"https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1555992457-720eb4e75880?w=1200&auto=format&fit=crop","An empty retro diner lit by neon, analogous to silent churn, where the empty booths record that customers left but never record why, so the remaining feedback comes only from the people who stayed",[316,722,723,724,728],{},"Underneath all of this sits a limit that no amount of better instrumentation removes. Ackerman, in a foundational paper on computer-supported work, named the socio-technical gap, the divide between what a system needs to support socially and what it can actually be built to support technically, because human activity is flexible, nuanced, and context-dependent while technical mechanisms stay rigid ",[341,725,726],{},[320,727,404],{"href":403},". For an AI product, part of whether it works for a particular person lives in context an eval cannot enumerate, the user's unstated goal, how much weight they place on a confidently wrong answer, the stakes of the moment, the social setting the output lands in. Some of that is not in the test set, and some of it could not be put there.",[316,730,731],{},"An empty diner sharpens the point. The bare booths record that people left, not why they left. Dissatisfied users mostly do not file tickets, they simply stop coming back, so the feedback that would correct the dashboard is filtered by who chose to stay. The dashboard hears from the survivors and infers contentment from their presence, which is exactly the population least likely to report the problem that drove everyone else away.",[299,733,735],{"id":734},"measuring-the-team-and-the-experience","Measuring the Team and the Experience",[316,737,738,739,743,744,748,749,753,757,758,762],{},"None of this argues against evals, and none of it asks for a single new number. It argues for widening what the dashboard is allowed to count as evidence. Scoring the human-AI team rather than the model alone, by tracking whether the pair outperforms either side solo and whether reliance on the model is appropriate rather than automatic, brings the duet back into view ",[341,740,741],{},[320,742,346],{"href":345},". Treating an offline eval as a screen that earns a candidate the right to an online test, and letting the online result rather than the offline ranking make the call, respects what recommender systems learned the hard way ",[341,745,746],{},[320,747,368],{"href":367},". Choosing a long-term evaluation criterion, watching for behavioral proxies that drift away from it, and running experiments past the novelty window keep the online dashboard from telling a flattering short story ",[341,750,751],{},[320,752,378],{"href":377},[341,754,755],{},[320,756,390],{"href":389},". And instrumenting the interaction itself, recovery after a bad turn, escalation rate, repeat use, numbers broken out by user segment rather than averaged into one cheerful figure, narrows the part of the gap that can be narrowed, while a human channel stays open for the part that cannot ",[341,759,760],{},[320,761,404],{"href":403},".",[299,764,766],{"id":765},"what-the-green-actually-certifies","What the Green Actually Certifies",[316,768,769],{},"A green offline dashboard certifies something narrow, and stating it plainly is half the cure. The model did well on a fixed set of inputs, by itself, at one moment, measured by a quantity that may or may not be the one users feel. It does not certify that the human-AI team did well, that the result holds up against live traffic, that the proxy being optimized is the thing users actually value, or that the unmeasurable remainder is being handled at all. The distance between scoring a model and serving a person is where the unhappy users are, and that distance does not show up on a dashboard built to watch the model. Closing it starts with measuring the second thing, not assuming the first one stands in for it.",[287,771,291,773,291,775],{"className":772},[464,465],[299,774,468],{"id":464},[470,776,308,778,308,789,308,800,308,811,308,822,291],{"className":777},[473,474,475,476],[478,779,780,781,784,785],{"id":480},"G. Bansal et al., \"Does the Whole Exceed its Parts? The Effect of AI Explanations on Complementary Team Performance,\" in ",[483,782,783],{},"Proc. 2021 CHI Conference on Human Factors in Computing Systems (CHI '21)",", 2021. DOI: ",[320,786,494],{"href":787,"target":490,"className":788},"https:\u002F\u002Fdoi.org\u002F10.1145\u002F3411764.3445717",[492,493],[478,790,791,792,795,796],{"id":497},"M. Rossetti, F. Stella, and M. Zanker, \"Contrasting Offline and Online Results when Evaluating Recommendation Algorithms,\" in ",[483,793,794],{},"Proc. 10th ACM Conference on Recommender Systems (RecSys '16)",", 2016, pp. 31–34. DOI: ",[320,797,494],{"href":798,"target":490,"className":799},"https:\u002F\u002Fdoi.org\u002F10.1145\u002F2959100.2959176",[492,493],[478,801,802,803,806,807],{"id":509},"J. Kleinberg, S. Mullainathan, and M. Raghavan, \"The Challenge of Understanding What Users Want: Inconsistent Preferences and Engagement Optimization,\" ",[483,804,805],{},"Management Science",", vol. 70, no. 9, pp. 6336–6355, 2024. DOI: ",[320,808,494],{"href":809,"target":490,"className":810},"https:\u002F\u002Fdoi.org\u002F10.1287\u002Fmnsc.2022.03683",[492,493],[478,812,813,814,817,818],{"id":519},"R. Kohavi et al., \"Trustworthy Online Controlled Experiments: Five Puzzling Outcomes Explained,\" in ",[483,815,816],{},"Proc. 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '12)",", 2012, pp. 786–794. DOI: ",[320,819,494],{"href":820,"target":490,"className":821},"https:\u002F\u002Fdoi.org\u002F10.1145\u002F2339530.2339653",[492,493],[478,823,824,825,828,829],{"id":529},"M. S. Ackerman, \"The Intellectual Challenge of CSCW: The Gap Between Social Requirements and Technical Feasibility,\" ",[483,826,827],{},"Human-Computer Interaction",", vol. 15, no. 2-3, pp. 179–203, 2000. DOI: ",[320,830,494],{"href":831,"target":490,"className":832},"https:\u002F\u002Fdoi.org\u002F10.1207\u002FS15327051HCI1523_5",[492,493],{"title":591,"searchDepth":592,"depth":592,"links":834},[835,836,837,838,839,840,841,842],{"id":635,"depth":592,"text":636},{"id":657,"depth":592,"text":658},{"id":678,"depth":592,"text":679},{"id":693,"depth":592,"text":694},{"id":713,"depth":592,"text":714},{"id":734,"depth":592,"text":735},{"id":765,"depth":592,"text":766},{"id":464,"depth":592,"text":468},"2026-06-13","Suppose the eval is clean, stable, and honestly scored. A green dashboard can still sit above unhappy users, because it measures the model alone, on fixed inputs, at one moment, while the user lives with the whole system, in back-and-forth, over weeks. Three fields that have lived this gap have the evidence for it.",{"src":641},{"authors":847,"badge":850,"source":852},[848],{"avatar":849,"name":610,"to":611},{"src":609},{"label":851},"AI Evaluation",{"name":615,"url":611},{"title":170,"description":844},"jU0wYYIVBiy-h3OtKktxcE5wMAD-MmLUJv5ZS3Qbqis",1782047594408]