<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1661444</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Functional partitioning through competitive learning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Tacke</surname> <given-names>Marius</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3122774/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Busch</surname> <given-names>Matthias</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Linka</surname> <given-names>Kevin</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1366449/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Cyron</surname> <given-names>Christian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Aydin</surname> <given-names>Roland</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Institute of Material Systems Modeling, Helmholtz-Zentrum Hereon</institution>, <addr-line>Geesthacht</addr-line>, <country>Germany</country></aff>
<aff id="aff2"><sup>2</sup><institution>Institute for Continuum and Material Mechanics, Hamburg University of Technology</institution>, <addr-line>Hamburg</addr-line>, <country>Germany</country></aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2912679/overview">Yang Hu</ext-link>, University of Oxford, United Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1495056/overview">Opeoluwa Owoyele</ext-link>, Louisiana State University, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3148251/overview">Yingbin Chen</ext-link>, The University of Iowa, United States</p>
</fn>
<corresp id="c001">&#x0002A;Correspondence: Marius Tacke <email>marius.tacke&#x00040;hereon.de</email></corresp>
<corresp id="c002">Roland Aydin <email>roland.aydin&#x00040;hereon.de</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>05</day>
<month>11</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1661444</elocation-id>
<history>
<date date-type="received">
<day>07</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>16</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Tacke, Busch, Linka, Cyron and Aydin.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Tacke, Busch, Linka, Cyron and Aydin</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Datasets often incorporate various functional patterns related to different aspects or regimes, which are typically not equally present throughout the dataset. We propose a novel partitioning algorithm that utilizes competition between models to detect and separate these functional patterns. This competition is induced by multiple models iteratively submitting their predictions for the dataset, with the best prediction for each data point being rewarded with training on that data point. This reward mechanism amplifies each model&#x00027;s strengths and encourages specialization in different patterns. The specializations can then be translated into a partitioning scheme. We validate our concept with datasets with clearly distinct functional patterns, such as mechanical stress and strain data in a porous structure. Our partitioning algorithm produces valuable insights into the datasets&#x00027; structure, which can serve various further applications. As a demonstration of one exemplary usage, we set up modular models consisting of multiple expert models, each learning a single partition, and compare their performance on more than twenty popular regression problems with single models learning all partitions simultaneously. Our results show significant improvements, with up to 56% loss reduction, confirming our algorithm&#x00027;s utility.</p></abstract>
<kwd-group>
<kwd>partitioning</kwd>
<kwd>clustering</kwd>
<kwd>unsupervised learning</kwd>
<kwd>machine learning</kwd>
<kwd>competitive learning</kwd>
</kwd-group>
<counts>
<fig-count count="9"/>
<table-count count="1"/>
<equation-count count="7"/>
<ref-count count="64"/>
<page-count count="12"/>
<word-count count="7689"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Datasets can include multiple sections that adhere to distinct regimes. For instance, in stress-strain tests of materials, the initial phase exhibits elastic behavior, which is reversible. However, if the material is stretched further, it enters a phase of plastic behavior, resulting in permanent changes. Similarly, self-driving cars face unique challenges when navigating construction zones, which may be specific to certain regions of the parameter space, just as they do on highways or country roads. This mixture of functional patterns affects how difficult datasets are for models to learn. Typically, the more diverse the patterns within a dataset, the more challenging it is for a model to achieve high accuracy. In this work, we present a novel partitioning algorithm that detects such functional patterns and, when possible, separates them.</p>
<p>Given these mixed regimes, the modeling task can be viewed in two steps: first, split the domain, then build a model that covers all parts. In practice, these steps are often implemented within a single process, but they can also be separated. The first step&#x02014;standalone domain splitting&#x02014;is known as clustering. A classic example is k-means (<xref ref-type="bibr" rid="B33">Macqueen, 1967</xref>). Most clustering methods group points by an assumed similarity measure. In k-means, spatial proximity defines similarity. K-means iterates between assigning each point to its nearest centroid and updating centroids to the mean of assigned points. Extensions include fuzzy c-means for soft assignments (<xref ref-type="bibr" rid="B12">Dunn, 1974</xref>) and game-based k-means, which strengthens competition among centroids for samples (<xref ref-type="bibr" rid="B45">Rezaee et al., 2021</xref>). Clustering has been extensively studied; the surveys by (<xref ref-type="bibr" rid="B24">Jain 2010</xref>), (<xref ref-type="bibr" rid="B11">Du 2010</xref>), (<xref ref-type="bibr" rid="B1">Aggarwal and Reddy 2013</xref>), and (<xref ref-type="bibr" rid="B14">Ezugwu et al. 2021</xref>) provide broader overviews.</p>
<p>A classical approach that unifies domain splitting with modeling is the mixture of experts (MoE), introduced by (<xref ref-type="bibr" rid="B23">Jacobs et al. 1991</xref>). In MoE, a gating network makes soft partitions of the input space and routes samples to local experts. Training is often carried out with the expectation maximization (EM) algorithm. The latent responsibilities decouple gate and expert updates and induce competitive learning, so experts that better explain a sample are rewarded and specialization emerges. The hierarchical MoE by (<xref ref-type="bibr" rid="B26">Jordan and Jacobs 1994</xref>) extends this idea with tree-structured gating, which increases modularity and enables progressively refined splits. Subsequent work explored localized gates based on Gaussian densities, which yield analytic updates for the gate and faster training while preserving competition among experts (<xref ref-type="bibr" rid="B59">Xu et al., 1994</xref>). To manage overfitting and model complexity, variational and Bayesian formulations place distributions over parameters, improving regularization and model selection while maintaining competitive allocation of data (<xref ref-type="bibr" rid="B55">Waterhouse et al., 1995</xref>; <xref ref-type="bibr" rid="B53">Ueda and Ghahramani, 2002</xref>). Stability in multiclass settings has been analyzed, and remedies such as small learning rates and expectation conditional maximization (ECM) style separate updates have been shown to sustain specialization despite parameter coupling (<xref ref-type="bibr" rid="B4">Chen et al., 1999</xref>; <xref ref-type="bibr" rid="B38">Ng and McLachlan, 2004</xref>). Beyond neural experts, MoE has been combined with support vector machines (SVMs) and Gaussian processes (GP), including a mixture of GP experts that assign regions of the input space to different GP components. 
These combinations improve flexibility and scalability for nonstationary data (<xref ref-type="bibr" rid="B35">Meeds and Osindero, 2005</xref>; <xref ref-type="bibr" rid="B62">Yuan and Neubauer, 2008</xref>; <xref ref-type="bibr" rid="B32">Lima et al., 2007</xref>; <xref ref-type="bibr" rid="B50">Tresp, 2000</xref>). Extensions to time series and sequential data augment gates and experts with temporal structure and allow partitions to evolve over time (<xref ref-type="bibr" rid="B56">Weigend et al., 1995</xref>; <xref ref-type="bibr" rid="B3">Chen et al., 1996</xref>). For an accessible orientation to developments over the past two decades, see the survey of (<xref ref-type="bibr" rid="B63">Yuksel et al. 2012</xref>). (<xref ref-type="bibr" rid="B48">Shazeer et al. 2017</xref>) provided a recent efficiency proof by realizing conditional computation at scale. They introduced sparsely gated MoE layers with thousands of feedforward experts and routed only a few per example, which yielded very large capacity at modest computational cost and state-of-the-art results in language modeling and machine translation.</p>
<p>Beyond the classical MoE approach, several ensemble methods pursue localization and specialization without a gating network. The self-organizing map by (<xref ref-type="bibr" rid="B29">Kohonen 1990</xref>) uses competitive learning to arrange prototypes on a low-dimensional lattice, which promotes local specialization and is widely used for clustering and visualization. Iterative splitting methods repeatedly partition the dataset and spawn new models when accuracy remains insufficient, so experts emerge that specialize on different regions (<xref ref-type="bibr" rid="B19">Gordon and Crouson, 2008</xref>). (<xref ref-type="bibr" rid="B64">Zhang and Liu 2002</xref>) introduced the one prototype take one cluster paradigm (OPTOC), which creates models as needed and lets them compete for data points, and (<xref ref-type="bibr" rid="B58">Wu et al. 2004</xref>) adapted it to gene expression clustering.</p>
<p>There is fast-growing work on sparse MoE for large language models (LLMs) that aims to expand capacity without increasing compute per token. As one example, (<xref ref-type="bibr" rid="B10">Do et al. 2025</xref>) study routing in transformer-based MoE and propose USMoE that compares token choice and expert choice. Building on (<xref ref-type="bibr" rid="B48">Shazeer et al. 2017</xref>), (<xref ref-type="bibr" rid="B15">Fedus et al. 2022</xref>) integrate MoE into the transformer with a switch feedforward layer, enabling many more parameters at modest per-token compute. Refining this method, (<xref ref-type="bibr" rid="B42">Pham et al. 2024</xref>) address expert collapse and routing imbalance with winner-takes-all competition based on actual expert activations and with a separate router trained to predict these outcomes, which improves routing and representation diversity. For first-stage retrieval, (<xref ref-type="bibr" rid="B20">Guo et al. 2025</xref>) combine specialized lexical, local, and global matching experts with competitive training to balance effectiveness and efficiency.</p>
<p>Beyond applications, two recent theoretical studies develop mathematical foundations for MoE, analyzing when they succeed on clustered tasks and linking EM training to mirror descent with convergence guarantees (<xref ref-type="bibr" rid="B27">Kawata et al., 2025</xref>; <xref ref-type="bibr" rid="B18">Fruytier et al., 2024</xref>). (<xref ref-type="bibr" rid="B31">Li et al. 2022</xref>) explore feature-level rather than sample-level MoE using soft subspace clustering to assign features to multiple specialists rather than clustering samples. (<xref ref-type="bibr" rid="B6">Cort&#x000E9;s et al. 2025</xref>) apply multiple choice learning with a winner takes all loss to time series forecasting, and (<xref ref-type="bibr" rid="B39">Nikolic et al. 2025</xref>) use sparse MoE variational autoencoders to study unsupervised specialization. (<xref ref-type="bibr" rid="B43">Piwko et al. 2025</xref>) propose hellsemble, an ensemble that partitions data by difficulty and trains specialists on progressively harder subsets. The sequential variant follows a fixed order and passes misclassified instances forward, while the greedy variant selects at each step the model that yields the largest validation gain. In contrast to our approach, hellsemble is largely sequential, with later models correcting earlier errors, whereas our experts operate fully in parallel. (<xref ref-type="bibr" rid="B30">Krishnamurthy et al. 2023</xref>) show that classical MoE can yield unintuitive and imbalanced decompositions when the gate and the experts are trained jointly, which weakens specialization. They address this with attentive gating that leverages expert outputs and with data-driven similarity regularization to encourage balanced routing, an important issue they pursue along a different path than we do. (<xref ref-type="bibr" rid="B13">Eigen et al. 
2013</xref>) study deep MoE with two expert layers and a gating network for each layer, showing that the layers specialize in different aspects while keeping a fixed expert set and joint training. In another line of work, (<xref ref-type="bibr" rid="B40">Oldfield et al. 2024</xref>) design a generic MoE block that can be integrated into diverse architectures and that remains fully differentiable, using dense input-dependent routing rather than discrete selection to make the component plug and play. Finally, (<xref ref-type="bibr" rid="B54">Ukorigho and Owoyele 2025</xref>) present a competition-based model discovery approach close in spirit to ours, where models compete for data points and the winner discourages others from capturing similar samples to sharpen specialization. Key differences to our work include how the number of models is chosen, since they keep adding models while validation loss improves, whereas we add and drop models using explicit criteria based on the hardest samples and redundancy among specialists. Another difference is the training schedule, as they couple routing and expert optimization, while we separate partitioning because this partitioning enables a wide range of other uses, with analysis of regimes and active sampling as two examples. Furthermore, they evaluate on structurally different tasks than we do.</p>
<p>We propose an alternative to the classical mixture of experts: in our approach, multiple models compete for each data point. The model with the most accurate prediction is rewarded with training on that point, which drives specialization. The resulting expert preferences define a partitioning of the dataset. In this paper, we use that partitioning to build a modular model with one expert per region, and we compare it to a single global model.</p>
<p>Methods such as the iterative splitting of (<xref ref-type="bibr" rid="B19">Gordon and Crouson 2008</xref>) and the hellsemble framework of (<xref ref-type="bibr" rid="B43">Piwko et al. 2025</xref>) organize competition in a sequential split-and-refine loop rather than in parallel. While they are highly valuable for growing models, they actually react to residual error and capacity limits as the specialization they induce follows difficulty rather than stable semantic regimes. Our goal is different. We aim to expose regimes that arise from how different learners naturally win on different subsets. A central difference to classical MoE is our two-step design. We first partition the dataset, then we learn the final experts on the induced regions. This separation gives wide freedom in how to design the partitioning. In the present work, the partition is driven purely by competition. Compared to (<xref ref-type="bibr" rid="B54">Ukorigho and Owoyele 2025</xref>), we use this freedom to establish flexible adding and dropping modules that adjust the number of experts automatically. The framework also allows for great flexibility regarding the model class and hyperparameter settings within the competition.</p>
<p>The partitioning we obtain enables multiple secondary uses, such as facilitating data analysis or enabling efficient sampling strategies. Consider a scenario where sampling is expensive because each data point requires a costly experiment. One could collect data in batches and rerun the partitioning after each batch. After training a separate expert for each region, regions with underperforming experts could be interpreted as harder and thus prioritized for additional sampling. This approach aligns with the paradigm of active learning, where models are deliberately exposed to data points they are most uncertain about in order to improve their weaknesses. Our approach, however, inverts this idea. In our competition-based design, models do not train on their weaknesses but on their strengths: they are rewarded with training on those data points they predict most accurately. This deliberate choice drives specialization and induces the resulting partitioning of the dataset. Rather than seeking to reduce uncertainty, we exploit certainty to create a structured division of the data, which can then support downstream tasks such as expert modeling or targeted analysis.</p></sec>
<sec sec-type="materials and methods" id="s2">
<title>2 Materials and methods</title>
<sec>
<title>2.1 Partitioning algorithm</title>
<p>The objective of our approach is to detect functional patterns in datasets and separate them in case they appear separable. To achieve this, we propose competition among multiple models. We intentionally refer to models in a general sense, as our approach is not limited by the type of model used. However, for simplicity, one might consider simple feedforward networks as an example. The models compete for data points, which requires them to specialize in certain functional patterns of the dataset. This specialization can be translated into a partitioning of the dataset.</p>
<p>Given the dataset:</p>
<disp-formula id="E1"><mml:math id="M1"><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>we assume that the input features <italic>x</italic><sub><italic>i</italic></sub> and the output labels <italic>y</italic><sub><italic>i</italic></sub> are known. However, we assume that both the number of partitions and the location of their boundaries are unknown. We start with <italic>K</italic> models in the competition: Let <inline-formula><mml:math id="M2"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:mrow><mml:mi mathvariant="script">X</mml:mi></mml:mrow><mml:mo>&#x02192;</mml:mo><mml:mi>&#x0211D;</mml:mi></mml:math></inline-formula> denote the <italic>k</italic>-th model prediction, parameterized by &#x003B8;<sub><italic>k</italic></sub>, where &#x003B8;<sub><italic>k</italic></sub> represents the set of model parameters (e.g., weights and biases):</p>
<disp-formula id="E2"><mml:math id="M3"><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>For each data point in the dataset, all models submit their predictions. The prediction error for each model and data point is calculated as follows:</p>
<disp-formula id="E3"><mml:math id="M4"><mml:mrow><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>Each data point is assigned to the model whose prediction is closest to the true value, formally expressed as:</p>
<disp-formula id="E4"><mml:math id="M5"><mml:mrow><mml:mi>a</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo class="qopname">&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>thereby also defining the subset of the dataset assigned to each model:</p>
<disp-formula id="E5"><mml:math id="M6"><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02223;</mml:mo><mml:mi>a</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>As a reward for providing the most accurate prediction, the winning model is allowed to update its parameters using this subset of data points for one training epoch. The corresponding mean squared error, which in the case of neural networks is backpropagated through the network for optimization, is defined as:</p>
<disp-formula id="E6"><mml:math id="M7"><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>The global mean squared error can be expressed as:</p>
<disp-formula id="E7"><mml:math id="M8"><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x00398;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>However, this global loss is not used for optimization, as there is no trainable gating mechanism; instead, the partitioning of the dataset emerges from the competitive interaction among the networks. <xref ref-type="table" rid="T2">Algorithm 1</xref> describes the implementation of this idea. A corresponding flowchart is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<table-wrap position="float" id="T2"> 
<label>Algorithm 1</label>
<caption><p>Partitioning: best predictions are rewarded with training.</p></caption>
<table frame="hsides" rules="groups">
<tbody>
<tr><td align="left" valign="top"><monospace> &#x000A0;procedure <sc>main</sc></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0; for each <italic>epoch</italic> <bold>do</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0; &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;for each <italic>model</italic> <bold>do</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; Submit predictions for all data points.</monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; end <bold>for</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; for each <italic>datapoint</italic> <bold>do</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; Rank models according to their predictions.</monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; end <bold>for</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; for each <italic>model</italic> <bold>do</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; Train for one epoch with all won data points.</monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; end <bold>for</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0; end <bold>for</bold></monospace> </td></tr>
<tr><td align="left" valign="top"><monospace> &#x000A0;end <bold>procedure</bold></monospace></td></tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Flow chart of the partitioning algorithm: each data point is assigned to the model that submitted the best prediction. All models are trained with the data points in their partition for one epoch. This process is iterated.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0001.tif">
<alt-text>Flowchart illustrating a machine learning ensemble process. A dataset is input into three models: A, B, and C. Their outputs feed into a decision node labeled Best prediction, which directs results into three partitions: A, B, and C, each represented with pie charts.</alt-text>
</graphic>
</fig>
<p>This process&#x02014;models submitting predictions, ranking the predictions, and training the models on the data points for which they provided the best predictions&#x02014;is iterated. We call one such iteration an epoch of the algorithm. As the models specialize, we expect the assignments of data points to models to stabilize: a specialized expert will usually submit the best predictions for its domain. After a predefined number of epochs, the assignments of data points to models are considered final. Each model&#x00027;s won data points translate to a separate partition of the dataset. The hyperplanes between the partitions are stored in a support vector machine (SVM), making the partitioning technically available for other applications. Snapshots of the application of the algorithm to a one-dimensional function that we designed as a test dataset are shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The transition from random predictions at the beginning to specialized experts at the end is clearly visible. The assignments of data points to the specialized experts are translated into the final partitioning.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Exemplary partitioning. <bold>(a)</bold> Presents the self-designed test dataset, while <bold>(b)</bold> displays an exemplary partitioning result. <bold>(c)</bold> Illustrates the partitioning process, transitioning from networks with initial random predictions to the orange, red, and green networks each capturing distinct patterns. The process involves adding and removing networks as patterns are identified or networks deemed redundant.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0002.tif">
<alt-text>Three graphs depict the evolution of network mapping. Graph (a) shows a function with fluctuations. Graph (b) illustrates the same function with three networks overlayed. Graph (c) contains three panels: initial mapping at epoch zero with multiple networks, a bar chart showing network evolution across epochs, and final mapping at epoch one thousand with select networks.</alt-text>
</graphic>
</fig>
<p>Since the number of partitions is usually unknown beforehand, the partitioning algorithm includes an adding and a dropping mechanism to dynamically adapt the number of competing models to the dataset. To evaluate whether a new model should be added to the competition, we regularly identify the data points with the poorest predictions in the dataset and train a new model on these points. The new model is added to the competition if this improves the overall loss. <xref ref-type="fig" rid="F3">Figure 3</xref> demonstrates the addition of a model that successfully captures a significant portion of the sinusoidal section of a test function, which had previously been unlearned. For more details, see the pseudo-code of the adding mechanism in <xref ref-type="supplementary-material" rid="SM1">Appendix Algorithm 2</xref>. Conversely, redundant models that do not uniquely capture their own pattern should be eliminated. Such redundancy is indicated by models not winning any data points or by their predictions significantly overlapping with those of other models. The degree of redundancy is assessed by the increase in overall loss if the model were deleted. This factor is regularly checked, and all highly redundant models are removed. <xref ref-type="fig" rid="F4">Figure 4</xref> demonstrates the removal of the red model, as it only captures data points similarly well as the purple model. <xref ref-type="supplementary-material" rid="SM1">Appendix Algorithm 3</xref> provides the corresponding pseudo-code. The adding and dropping mechanisms are designed to balance each other. <xref ref-type="fig" rid="F2">Figure 2</xref> shows, as an example, how the number of competing models is adapted to the dataset from initially ten to finally three. This process involves both adding new models to capture previously unlearned patterns and removing redundant ones.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Adding a new network (red network 12) to the competition. Regularly, a new network is trained using the data points with the poorest predictions at that time. If the new network improves the overall loss, it is added to the competition. Here, the red network 12 is the first to capture the sinusoidal pattern.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0003.tif">
<alt-text>Two graphs compare function mappings at epochs 235 and 236. Both display multiple network outputs with distinct colored lines. An arrow highlights a circled region showing changes in network behavior between the epochs.</alt-text>
</graphic>
</fig>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Dropping a network (red network 12) from the competition as it appears redundant, failing to capture any patterns uniquely. Regularly, for each model, we check how much the overall loss would increase if the network were removed. If the increase is small, the corresponding network is considered redundant and is discarded. Here, the red network&#x00027;s predictions were too similar to the purple network&#x00027;s predictions.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0004.tif">
<alt-text>Side-by-side graphs show mappings after epochs 959 and 960, illustrating the behavior of functions labeled as Function and various networks: Net 9, Net 12, Net 47, and Net 53. A black dashed oval highlights changes, with a black arrow indicating evolution between epochs.</alt-text>
</graphic>
</fig>
<p>A significant asset of our partitioning algorithm is its ability to extend to a pattern-adaptive model type, architecture, and hyperparameter search without incurring additional costs. So far, competing models have been considered similar in terms of their type, architecture, and hyperparameter settings. However, all three can be randomly varied among the models, as it is reasonable to assume that different patterns may require, for example, wider neural networks or smaller learning rates. Consequently, the algorithm&#x00027;s output can not only be a partitioning but also an optimal configuration of model type, architecture, and hyperparameters for each partition.</p></sec>
<sec>
<title>2.2 Modular model</title>
<p>Applying the partitioning algorithm to datasets reveals interesting and valuable insights about the dataset&#x00027;s structure, as illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. Additionally, the partitioning can be utilized for various other purposes, such as learning the dataset using a divide-and-conquer approach. Traditionally, the entire dataset is used to train and optimize a single model. However, if the partitioning algorithm detects distinct functional patterns, it may be beneficial to have multiple expert models, each learning only one pattern, instead of pressing all patterns into a single model. Therefore, multiple expert models that each learn one partition are combined into a modular model. The SVM, which incorporates the boundaries between the partitions, serves as a switch between the experts. For each data point, the SVM decides which partition it belongs to and, consequently, which expert model to train or test. The structure of the modular model is illustrated with a flowchart in <xref ref-type="fig" rid="F5">Figure 5</xref>. With this approach, we believe that we can reduce model complexity and increase model accuracy for datasets that are structured by multiple distinct functional patterns with little overlap.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Flow chart of the modular model: each partition is learned by a separate expert model. For each data point, the SVM resulting from the partitioning algorithm decides which expert to train or to test. This way, the experts are combined into a modular model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0005.tif">
<alt-text>Flowchart showing a modular model. A dataset is divided into partitions A, B, and C, represented by pie charts. These partitions are processed by SVM (Support Vector Machine), visualized by a scatter plot icon, and then assigned to experts A, B, and C, illustrated by neural network symbols.</alt-text>
</graphic>
</fig>
<p>To evaluate this approach, we compared the performance of a single model trained on the entire dataset with that of a modular model comprising multiple expert models. We speak of models in general, as the type of model can be varied. In our experiments, we used feedforward neural networks. To ensure a fair comparison, we allowed the single model to have as many trainable parameters (weights and biases) as the combined total of all experts in the modular model. We conducted a hyperparameter optimization for each expert and separately for the single model serving as the baseline. To keep the hyperparameter search space manageable, we limited the search to the most influential parameters and applied reasonable constraints: the number of layers was varied between 2 and 6, the number of neurons per layer between 4 and 10, and the learning rate between 0.0001 and 0.005. All other hyperparameters were fixed at values listed in <xref ref-type="supplementary-material" rid="SM1">Appendix Table 2</xref>. Within this reduced search space, we performed 100 grid search trials for each expert model and each single model. This process ensures that any advantages or disadvantages are not due to unfitting parameters or outliers. To estimate the stability of both approaches, we repeated each run&#x02014;partitioning the dataset, training the modular model including hyperparameter optimization, and training the single model including hyperparameter optimization&#x02014;ten times.</p></sec>
<sec>
<title>2.3 Datasets</title>
<p>We designed one-dimensional, section-wise defined functions to serve as test datasets for validating the effectiveness of our approach and its implementation. The anomaly-crest function is illustrated in <xref ref-type="fig" rid="F2">Figure 2a</xref>, and the wave-climb function is depicted in <xref ref-type="fig" rid="F6">Figure 6a</xref>. Due to their section-wise definition, these functions exhibit different local functional patterns, akin to several engineering problems. One such example is modeling the stress-strain curves of materials with porous structures. These materials offer an excellent balance between weight and strength, but their stress-strain curves are typically challenging to model due to the presence of diverse functional patterns. An exemplary stress-strain curve for such a material is shown in <xref ref-type="fig" rid="F6">Figure 6b</xref>. The data for this porous structure&#x00027;s stress-strain curve were generously provided by (<xref ref-type="bibr" rid="B2">Ambekar et al. 2021</xref>), who collected them. We have observed a high robustness of our partitioning approach to variations in the models&#x00027; random initializations. <xref ref-type="fig" rid="F2">Figures 2</xref>, <xref ref-type="fig" rid="F6">6</xref> illustrate typical results.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Datasets to test the partitioning algorithm, illustrated with exemplary partitioning results. <bold>(a)</bold> Self-designed wave-climb function with three patterns identified by the algorithm (gray, green, blue). <bold>(b)</bold> Porous structure&#x00027;s stress-strain dataset generously provided by (<xref ref-type="bibr" rid="B2">Ambekar et al. 2021</xref>) with three patterns identified by the algorithm (red, green, orange).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0006.tif">
<alt-text>Graph a shows a curve with oscillations on the left, stabilizing and rising sharply on the right, using a color gradient from grey to yellow to blue. Graph b depicts compressive stress versus strain, starting low in red, increasing steadily in orange and green, ending higher around 5 MPa at 24% strain.</alt-text>
</graphic>
</fig>
<p>In addition to the one-dimensional datasets, we evaluated our method using 22 popular higher-dimensional real-world datasets from the UCI Machine Learning Repository (<xref ref-type="bibr" rid="B28">Kelly et al., 2024</xref>). Our tests focused exclusively on regression problems, though our approach can be readily extended to classification problems. Acknowledging that our assumption of distinct and separable characteristics may not apply to all datasets, we tested these 22 additional datasets to assess the frequency and extent to which the modular model, based on the partitioning algorithm, outperforms a single model (<xref ref-type="bibr" rid="B22">Imran et al., 2020</xref>; <xref ref-type="bibr" rid="B8">Cortez et al., 2009</xref>; <xref ref-type="bibr" rid="B37">Nash et al., 1995</xref>; <xref ref-type="bibr" rid="B41">Palechor and la Hoz Manotas, 2019</xref>; <xref ref-type="bibr" rid="B47">Schlimmer, 1987</xref>; <xref ref-type="bibr" rid="B9">Cortez and Morais, 2008</xref>; <xref ref-type="bibr" rid="B16">Feldmesser, 1987</xref>; <xref ref-type="bibr" rid="B61">Yeh, 2018</xref>; E and Cho, <xref ref-type="bibr" rid="B46">2020</xref>; <xref ref-type="bibr" rid="B52">Tsanas and Xifara, 2012</xref>; <xref ref-type="bibr" rid="B60">Yeh, 2007</xref>; <xref ref-type="bibr" rid="B49">Tfekci and Kaya, 2014</xref>; <xref ref-type="bibr" rid="B7">Cortez, 2014</xref>; <xref ref-type="bibr" rid="B44">Quinlan, 1993</xref>; <xref ref-type="bibr" rid="B34">Matzka, 2020</xref>; <xref ref-type="bibr" rid="B57">Wolberg et al., 1995</xref>; <xref ref-type="bibr" rid="B17">Fernandes et al., 2015</xref>; <xref ref-type="bibr" rid="B25">Janosi et al., 1988</xref>; <xref ref-type="bibr" rid="B51">Tsanas and Little, 2009</xref>; <xref ref-type="bibr" rid="B5">Chen, 2017</xref>; <xref ref-type="bibr" rid="B36">Moro et al., 2016</xref>; <xref ref-type="bibr" rid="B21">Hamidieh, 2018</xref>). 
A characterization of all test cases is provided in <xref ref-type="supplementary-material" rid="SM1">Appendix Table 3</xref>.</p></sec></sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>We evaluated the predictions of both approaches using mean squared error (MSE) and R<sup>2</sup>. We expected our pipeline of partitioning algorithm and modular model to outperform the single model in some, but not all test cases. This was confirmed: the pipeline showed clear advantages in 6 out of 25 cases. For the two synthetic test functions, the modular model outperformed the single model by orders of magnitude, validating the concept. On the porous structure&#x00027;s stress-strain data, which inspired the test functions, the modular model reduced the test MSE by 54% on average over 10 runs. The modular model also showed strong performance on three real-world datasets. On the energy efficiency dataset, it achieved a 56% reduction in test MSE, on the automobile dataset, 29%, and on the student performance dataset, 10%, all averaged over 10 runs.</p>
<p><xref ref-type="fig" rid="F7">Figure 7</xref> shows histograms of test MSE for the modular and single models. <xref ref-type="fig" rid="F8">Figure 8</xref> shows the same predictions evaluated with R<sup>2</sup>, offering a more intuitive illustration of the performance. Both figures focus on the six datasets where the modular model had a significant advantage. Each histogram displays results from ten runs per model. The x-axis shows either test MSE or R<sup>2</sup>; the y-axis shows the number of runs achieving each value. Higher bars on the left indicate better performance.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Histograms illustrating the test mean squared error (MSE) of single and modular model for ten runs with each of the six selected datasets. The higher the bars on the left side, the better the performance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0007.tif">
<alt-text>Six bar charts compare the performance of single and modular models in terms of test mean squared error (MSE) across different functions or datasets. Each chart displays the number of runs and the mean MSE for both models, with blue bars representing the single model and orange for the modular model. Overlap is also shown. Titles indicate the focus: Anomaly-crest function, Wave-climb function, Porous structures stress-strain curve, Automobile insurance risk, Energy efficiency, and Students Portuguese grades.</alt-text>
</graphic>
</fig>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Histograms illustrating the test R<sup>2</sup> scores of single and modular model for ten runs with each of the six selected datasets. The higher the bars on the left side, the better the performance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0008.tif">
<alt-text>Six bar charts compare test R-squared values for single and modular models across different functions. Each chart shows number of runs versus test R-squared values. Functions include anomaly-crest, wave-climb, porous structures, automobile insurance risk, energy efficiency, and students Portuguese grades. Mean R-squared values are listed for each model. Bars represent overlaps, modular models, and single models in different colors.</alt-text>
</graphic>
</fig>
<p><xref ref-type="table" rid="T1">Table 1</xref> summarizes the six datasets shown in the histograms, listing features, labels, and data points. <xref ref-type="supplementary-material" rid="SM1">Appendix Table 3</xref> includes this information for all 25 datasets and is placed in the appendix due to its length.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Characterization of the six datasets in <xref ref-type="fig" rid="F7">Figures 7</xref> and <xref ref-type="fig" rid="F8">8</xref>.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>URL</bold></th>
<th valign="top" align="left"><bold>Synthetic</bold></th>
<th valign="top" align="center"><bold>&#x00023; features</bold></th>
<th valign="top" align="center"><bold>&#x00023; labels</bold></th>
<th valign="top" align="center"><bold>&#x00023; samples</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Anomaly-Crest function</td>
<td valign="top" align="left"><ext-link ext-link-type="uri" xlink:href="https://github.com/FunctionalPartitioning/FunctionalPartitioning">URL</ext-link></td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">10,000</td>
</tr> <tr>
<td valign="top" align="left">Wave-climb function</td>
<td valign="top" align="left"><ext-link ext-link-type="uri" xlink:href="https://github.com/FunctionalPartitioning/FunctionalPartitioning">URL</ext-link></td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">10,000</td>
</tr> <tr>
<td valign="top" align="left">Automobile insurance</td>
<td valign="top" align="left"><ext-link ext-link-type="uri" xlink:href="https://archive.ics.uci.edu/dataset/10/automobile">URL</ext-link></td>
<td valign="top" align="left">No</td>
<td valign="top" align="center">25</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">205</td>
</tr> <tr>
<td valign="top" align="left">Energy efficiency</td>
<td valign="top" align="left"><ext-link ext-link-type="uri" xlink:href="https://archive.ics.uci.edu/dataset/242/energy&#x0002B;efficiency">URL</ext-link></td>
<td valign="top" align="left">No</td>
<td valign="top" align="center">8</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">768</td>
</tr> <tr>
<td valign="top" align="left">Students&#x00027; grades</td>
<td valign="top" align="left"><ext-link ext-link-type="uri" xlink:href="https://archive.ics.uci.edu/dataset/320/student&#x0002B;performance">URL</ext-link></td>
<td valign="top" align="left">No</td>
<td valign="top" align="center">30</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">649</td>
</tr>
<tr>
<td valign="top" align="left">Stress-strain curve</td>
<td valign="top" align="left"><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/adem.202001428">URL</ext-link></td>
<td valign="top" align="left">No</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">4,065</td>
</tr></tbody>
</table>
</table-wrap>
<p>Since training efficiency is key in machine learning, we also compared the training times of the modular model and single model approach. We measured the time required for a 100-trial grid search for hyperparameter tuning. For the modular model, we also included the time to run the partitioning algorithm. <xref ref-type="fig" rid="F9">Figure 9</xref> presents a bar plot of training times for the six highlighted datasets. The x-axis shows the datasets, the y-axis (log scale) shows computation time in seconds on a standard desktop computer (Intel Core i9-11950H &#x00040; 2.60GHz, 64GB RAM, NVIDIA RTX A5000 with 24GB VRAM). Compared to the hyperparameter search, the partitioning algorithm adds negligible time. More impactful is the modular model&#x00027;s use of multiple but smaller models, which speeds up backpropagation. Overall, training times are similar, with a slight advantage for the modular model.</p>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Bar plot showing the computation time for the training of both the single-model and modular-model approaches. For the modular model, the total time includes the execution of the partitioning algorithm. For both approaches, the time required to perform a 100-trial grid search for hyperparameter optimization is included.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1661444-g0009.tif">
<alt-text>Bar chart titled "Computation time comparison" showing computation time in seconds on a logarithmic scale. Categories include anomaly-crust function, wave-climb function, automobile insurance, energy efficiency, student grades, and stress-strain curve. It compares single and modular models. Single model generally takes longer, especially in anomaly-crust and wave-climb functions, whereas modular model is faster in energy efficiency.</alt-text>
</graphic>
</fig>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>As introduced in Section 2.1, the partitioning algorithm is based on the competition between multiple models: iteratively, each model is trained on the data points for which it provided the best predictions. The progressive reinforcement of each model&#x00027;s strengths drives their specialization, which we exploit to partition the dataset. Through competition, the models naturally align with distinct functional patterns in the data.</p>
<p>The application of our partitioning algorithm to the anomaly-crest function demonstrates that the competition between multiple models is generally effective for developing specialized experts and separating different functional patterns. The primary value of this partitioning lies in its ability to detect these distinct patterns and provide insights into the dataset&#x00027;s structure. For the anomaly-crest function, the four identified sections clearly differ in their functional characteristics (see <xref ref-type="fig" rid="F2">Figure 2</xref>). In the case of the wave-climb function, the algorithm successfully separates the two sinusoidal sections with different frequencies and amplitudes, as well as a final u-shaped section, which seems reasonable (see <xref ref-type="fig" rid="F6">Figure 6a</xref>). For the porous structure&#x00027;s stress-strain dataset, it is noteworthy that the first hook is identified as a distinct pattern. Subsequently, all sections with concave curvature are captured by the green model, while all sections with convex curvature are captured by the orange model. This partitioning was surprising, but it appears that the models find it easier to learn either concave or convex curvatures exclusively (see <xref ref-type="fig" rid="F6">Figure 6b</xref>). Having the models themselves detect which functional patterns can be learned well coherently was exactly what we were aiming for.</p>
<p>One potential concern is that a single model might, due to a lucky initialization, dominate the competition and suppress the emergence of specialized models. Our adding mechanism tackles this by relying on relative performance: new networks are iteratively trained on the samples with the least accurate predictions. Because this threshold is relative to the best model&#x00027;s performance rather than absolute, a newly initialized model, trained specifically on challenging samples, can always outperform the current model and enter the competition. That said, we do not claim that every dataset can be effectively partitioned using our approach. Some datasets exhibit a single coherent pattern or contain overlapping patterns that resist separation. If a single model consistently outperforms others, it may simply reflect that the dataset is best modeled holistically. In such cases, we view it as a strength of our method that it naturally converges to a single domain, signaling to the user that partitioning is not beneficial and that a unified model may be more appropriate. This behavior aligns with our results: while the modular model was not superior across all datasets, it outperformed the single model on six out of the 25 datasets tested. For the porous structure&#x00027;s stress-strain dataset and the energy efficiency dataset, the modular model achieved a loss reduction of over 50% (see <xref ref-type="fig" rid="F7">Figure 7</xref>). These findings support our hypothesis that for datasets with separable patterns, specialized expert models can offer significant advantages over a single unified model.</p>
<p>Even in cases where the modular model achieves lower average loss than the single model, the histograms show that individual trials can still favor the single model. This variability arises in part from randomness in the partitioning process: although the algorithm tends to converge to similar partitions, some runs produce splits that are more effective than others. More importantly, both approaches are influenced by stochastic factors during training, such as initialization and sample shuffling, which naturally lead to performance variance. Additionally, since we are dealing with standard feedforward neural networks and standard optimization algorithms, we also encounter standard challenges, such as models getting stuck in local minima, which can affect individual outcomes.</p>
<p>In <xref ref-type="supplementary-material" rid="SM1">Appendix Section 1</xref>, we describe a detailed analysis of the factors contributing to the performance of the modular model. Our findings reveal a correlation between the number of patterns identified by the partitioning algorithm and the modular model&#x00027;s performance: the more distinct patterns in the dataset, the better the modular model performs relative to the single model. This aligns with our expectation that not all datasets are suitable for our approach. The partitioning algorithm should primarily be applied to datasets that are expected to exhibit predominant patterns with minimal overlap. The clearer the patterns, the more effective the modular model is expected to be. Additionally, we examined the impact of our pattern-adaptive hyperparameter search, which optimizes the hyperparameter settings for each pattern. We discovered that tailoring the learning rates to each partition enhances the modular model&#x00027;s performance. However, our results indicate that adjusting the numbers of layers and neurons per layer for each pattern does not provide any significant advantage. Finally, we aimed to verify that the partitioning algorithm identifies substantial patterns rather than merely separating small and challenging snippets. Our results confirm that the more homogeneous the partition proportions, the more successful the modular model tends to be.</p>
<p>While this study exclusively uses feedforward neural networks, our framework is not limited to this model type. Since competition is moderated solely by prediction accuracy, the approach is flexible enough to incorporate a wide range of models, from simple linear regressors to very complex architectures such as large language models (LLMs). This generality opens up opportunities for future experiments with diverse model types, depending on dataset characteristics.</p>
<p>There are numerous potential applications of our partitioning, many of which we may not have yet considered. We found it important to illustrate a path that leads to measurable improvements by leveraging our partitioning results. One application we plan to explore in the future is using the partitioning algorithm for active learning. In the context of expensive data points, the following data collection loop could be advantageous: first, collect a batch of data points; then, apply the partitioning algorithm; and finally, train each partition with a separate model, akin to the modular model approach. Instead of immediately combining their predictions, we could assess each expert&#x00027;s performance and adjust the collection of new data points accordingly. Partitions that are more challenging to learn should receive more data points, while easier partitions should receive fewer. This approach could lead to a more efficient use of the data point budget. The process can be repeated iteratively. For instance, with a budget of 500 data points, we could run this process 10 times, each time distributing 50 data points according to the difficulty of the experts in learning their partitions in the last iteration.</p></sec>
<sec sec-type="conclusions" id="s5">
<title>5 Conclusions</title>
<p>In this paper, we introduced a novel partitioning algorithm. To the best of our knowledge, this algorithm is unique in its use of competition between models to generate a general-purpose partitioning scheme, without constraints on the dataset&#x00027;s origin or order. The partitioning is achieved by having multiple models iteratively submit their predictions for all points in the dataset and being rewarded for the best predictions with training on the corresponding data points. This process induces specialization in the models, which is then translated into a partitioning.</p>
<p>We demonstrated that our algorithm is both widely applicable and useful. Its wide applicability was shown by valuable results across datasets of varying dimensionalities, sparsities, and contexts&#x02014;from student education to engineering stress-strain tests. The utility of our algorithm was illustrated in two primary ways: first, the partitioning inherently provides insights into the dataset&#x00027;s structure. For instance, three distinct patterns were detected in the porous structure&#x00027;s stress-strain dataset: an initial hook, convex, and concave parts. Second, certain datasets can be learned more accurately with a modular model based on our partitioning algorithm than with a single model. If a model&#x00027;s accuracy in learning a dataset is unsatisfactory and the dataset is likely structured along predominant patterns with little overlap, we recommend applying our pipeline of the partitioning algorithm and modular model. Particularly in the context of expensive data points, improving the model on this path without adding more data points can be financially beneficial. In the future, we will explore a third application: adapting data collection strategies based on our partitioning algorithm.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The data for the section-wise defined functions is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/FunctionalPartitioning/FunctionalPartitioning">https://github.com/FunctionalPartitioning/FunctionalPartitioning</ext-link>. The stress-strain curve data for the porous structure is available upon request from (<xref ref-type="bibr" rid="B2">Ambekar et al., 2021</xref>), the original data collectors. All the high-dimensional, real-world datasets used as benchmarks to evaluate the effectiveness of our approach can be obtained from the UCI Machine Learning Repository (<xref ref-type="bibr" rid="B28">Kelly et al., 2024</xref>).</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>MT: Writing &#x02013; original draft, Software, Validation, Methodology, Visualization, Investigation. MB: Writing &#x02013; review &#x00026; editing. KL: Writing &#x02013; review &#x00026; editing. CC: Writing &#x02013; review &#x00026; editing, Project administration, Conceptualization. RA: Conceptualization, Writing &#x02013; review &#x00026; editing, Supervision, Project administration.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. MB gratefully acknowledges funding by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation)-Projektnummer 535656357. KL gratefully acknowledges funding by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) &#x02013; Projektnummer 533187597.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2025.1661444/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frai.2025.1661444/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aggarwal</surname> <given-names>C. C.</given-names></name> <name><surname>Reddy</surname> <given-names>C. K.</given-names></name></person-group> (<year>2013</year>). <source>Data Clustering: Algorithms and Applications</source>. London: CRC Press. Taylor &#x00026; Francis Group. <pub-id pub-id-type="doi">10.1201/b15410</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ambekar</surname> <given-names>R. S.</given-names></name> <name><surname>Mohanty</surname> <given-names>I.</given-names></name> <name><surname>Kishore</surname> <given-names>S.</given-names></name> <name><surname>Das</surname> <given-names>R.</given-names></name> <name><surname>Pal</surname> <given-names>V.</given-names></name> <name><surname>Kushwaha</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Atomic scale structure inspired 3D-printed porous structures with tunable mechanical response</article-title>. <source>Adv. Eng. Mater</source>. <volume>23</volume>:<fpage>2001428</fpage>. <pub-id pub-id-type="doi">10.1002/adem.202001428</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>K.</given-names></name> <name><surname>Xie</surname> <given-names>D.</given-names></name> <name><surname>Chi</surname> <given-names>H.</given-names></name></person-group> (<year>1996</year>). <article-title>A modified hme architecture for text-dependent speaker identification</article-title>. <source>IEEE Trans. Neural Netw</source>. <volume>7</volume>, <fpage>1309</fpage>&#x02013;<lpage>1313</lpage>. <pub-id pub-id-type="doi">10.1109/72.536325</pub-id><pub-id pub-id-type="pmid">18263525</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>K.</given-names></name> <name><surname>Xu</surname> <given-names>L.</given-names></name> <name><surname>Chi</surname> <given-names>H.</given-names></name></person-group> (<year>1999</year>). <article-title>Improved learning algorithms for mixture of experts in multiclass classification</article-title>. <source>Neural Netw</source>. <volume>12</volume>, <fpage>1229</fpage>&#x02013;<lpage>1252</lpage>. <pub-id pub-id-type="doi">10.1016/S0893-6080(99)00043-X</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <source>Beijing PM2.5</source>. <publisher-loc>UCI Machine Learning Repository</publisher-loc>.</citation>
</ref>
<ref id="B6">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cort&#x000E9;s</surname> <given-names>A.</given-names></name> <name><surname>Rehm</surname> <given-names>R.</given-names></name> <name><surname>Letzelter</surname> <given-names>V.</given-names></name></person-group> (<year>2025</year>). <article-title>&#x0201C;Winner-takes-all for multivariate probabilistic time series forecasting,&#x0201D;</article-title> in <source>ICML 2025: The 42nd International Conference on Machine Learning</source>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cortez</surname> <given-names>P.</given-names></name></person-group> (<year>2014</year>). <source>Student Performance</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cortez</surname> <given-names>P.</given-names></name> <name><surname>Cerdeira</surname> <given-names>A.</given-names></name> <name><surname>Almeida</surname> <given-names>F.</given-names></name> <name><surname>Matos</surname> <given-names>T.</given-names></name> <name><surname>Reis</surname> <given-names>J.</given-names></name></person-group> (<year>2009</year>). <source>Wine Quality</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cortez</surname> <given-names>P.</given-names></name> <name><surname>Morais</surname> <given-names>A.</given-names></name></person-group> (<year>2008</year>). <source>Forest Fires</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Do</surname> <given-names>G.</given-names></name> <name><surname>Le</surname> <given-names>H.</given-names></name> <name><surname>Tran</surname> <given-names>T.</given-names></name></person-group> (<year>2025</year>). <article-title>Sparse mixture of experts as unified competitive learning</article-title>. <source>arXiv preprint arXiv:2503.22996</source>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Du</surname> <given-names>K.-L.</given-names></name></person-group> (<year>2010</year>). <article-title>Clustering: a neural network approach</article-title>. <source>Neural Netw</source>. <volume>23</volume>, <fpage>89</fpage>&#x02013;<lpage>107</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2009.08.007</pub-id><pub-id pub-id-type="pmid">19758784</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dunn</surname> <given-names>J. C.</given-names></name></person-group> (<year>1974</year>). <article-title>Well-separated clusters and optimal fuzzy partitions</article-title>. <source>J. Cybern</source>. <volume>4</volume>, <fpage>95</fpage>&#x02013;<lpage>104</lpage>. <pub-id pub-id-type="doi">10.1080/01969727408546059</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Eigen</surname> <given-names>D.</given-names></name> <name><surname>Ranzato</surname> <given-names>M.</given-names></name> <name><surname>Sutskever</surname> <given-names>I.</given-names></name></person-group> (<year>2013</year>). <article-title>Learning factored representations in a deep mixture of experts</article-title>. <source>arXiv preprint arXiv:1312.4314</source>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ezugwu</surname> <given-names>A. E.</given-names></name> <name><surname>Shukla</surname> <given-names>A. K.</given-names></name> <name><surname>Agbaje</surname> <given-names>M. B.</given-names></name> <name><surname>Oyelade</surname> <given-names>O. N.</given-names></name> <name><surname>Jos&#x000E9;-Garc&#x000ED;a</surname> <given-names>A.</given-names></name> <name><surname>Agushaka</surname> <given-names>J. O.</given-names></name></person-group> (<year>2021</year>). <article-title>Automatic clustering algorithms: a systematic review and bibliometric analysis of relevant literature</article-title>. <source>Neural Comput. Applic</source>. <volume>33</volume>, <fpage>6247</fpage>&#x02013;<lpage>6306</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-020-05395-4</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Fedus</surname> <given-names>W.</given-names></name> <name><surname>Zoph</surname> <given-names>B.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name></person-group> (<year>2022</year>). <article-title>Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity</article-title>. <source>J. Mach. Learn. Res</source>. <volume>23</volume>, <fpage>1</fpage>&#x02013;<lpage>39</lpage>. <ext-link ext-link-type="uri" xlink:href="http://jmlr.org/papers/v23/21-0998.html">http://jmlr.org/papers/v23/21-0998.html</ext-link></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Feldmesser</surname> <given-names>J.</given-names></name></person-group> (<year>1987</year>). <source>Computer Hardware</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fernandes</surname> <given-names>K.</given-names></name> <name><surname>Vinagre</surname> <given-names>P.</given-names></name> <name><surname>Cortez</surname> <given-names>P.</given-names></name> <name><surname>Sernadela</surname> <given-names>P.</given-names></name></person-group> (<year>2015</year>). <source>Online News Popularity</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fruytier</surname> <given-names>Q.</given-names></name> <name><surname>Mokhtari</surname> <given-names>A.</given-names></name> <name><surname>Sanghavi</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Learning mixtures of experts with em: a mirror descent perspective</article-title>. <source>arXiv preprint arXiv:2411.06056</source>.</citation>
</ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Gordon</surname> <given-names>V. S.</given-names></name> <name><surname>Crouson</surname> <given-names>J.</given-names></name></person-group> (<year>2008</year>). <article-title>&#x0201C;Self-splitting modular neural network-domain partitioning at boundaries of trained regions,&#x0201D;</article-title> in <source>2008 IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1085</fpage>&#x02013;<lpage>1091</lpage>. <pub-id pub-id-type="doi">10.1109/IJCNN.2008.4633934</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>J.</given-names></name> <name><surname>Cai</surname> <given-names>Y.</given-names></name> <name><surname>Bi</surname> <given-names>K.</given-names></name> <name><surname>Fan</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>W.</given-names></name> <name><surname>Zhang</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Came: competitively learning a mixture-of-experts model for first-stage retrieval</article-title>. <source>ACM Trans. Inf. Syst</source>. <volume>43</volume>, <fpage>1</fpage>&#x02013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1145/3757737</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hamidieh</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <source>Superconductivty Data</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Imran</surname> <given-names>A. A.</given-names></name> <name><surname>Rahim</surname> <given-names>M. S.</given-names></name> <name><surname>Ahmed</surname> <given-names>T.</given-names></name></person-group> (<year>2020</year>). <source>Productivity Prediction of Garment Employees</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jacobs</surname> <given-names>R. A.</given-names></name> <name><surname>Jordan</surname> <given-names>M. I.</given-names></name> <name><surname>Nowlan</surname> <given-names>S. J.</given-names></name> <name><surname>Hinton</surname> <given-names>G. E.</given-names></name></person-group> (<year>1991</year>). <article-title>Adaptive mixtures of local experts</article-title>. <source>Neural Comput</source>. <volume>3</volume>, <fpage>79</fpage>&#x02013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1162/neco.1991.3.1.79</pub-id><pub-id pub-id-type="pmid">31141872</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jain</surname> <given-names>A. K.</given-names></name></person-group> (<year>2010</year>). <article-title>Data clustering: 50 years beyond k-means</article-title>. <source>Pattern Recognit. Lett</source>. <volume>31</volume>, <fpage>651</fpage>&#x02013;<lpage>666</lpage>. <pub-id pub-id-type="doi">10.1016/j.patrec.2009.09.011</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Janosi</surname> <given-names>A.</given-names></name> <name><surname>Steinbrunn</surname> <given-names>W.</given-names></name> <name><surname>Pfisterer</surname> <given-names>M.</given-names></name> <name><surname>Detrano</surname> <given-names>R.</given-names></name></person-group> (<year>1988</year>). <source>Heart Disease</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jordan</surname> <given-names>M. I.</given-names></name> <name><surname>Jacobs</surname> <given-names>R. A.</given-names></name></person-group> (<year>1994</year>). <article-title>Hierarchical mixtures of experts and the em algorithm</article-title>. <source>Neural Comput</source>. <volume>6</volume>, <fpage>181</fpage>&#x02013;<lpage>214</lpage>. <pub-id pub-id-type="doi">10.1162/neco.1994.6.2.181</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kawata</surname> <given-names>R.</given-names></name> <name><surname>Matsutani</surname> <given-names>K.</given-names></name> <name><surname>Kinoshita</surname> <given-names>Y.</given-names></name> <name><surname>Nishikawa</surname> <given-names>N.</given-names></name> <name><surname>Suzuki</surname> <given-names>T.</given-names></name></person-group> (<year>2025</year>). <article-title>Mixture of experts provably detect and learn the latent cluster structure in gradient-based learning</article-title>. <source>arXiv preprint arXiv:2506.01656</source>.</citation>
</ref>
<ref id="B28">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kelly</surname> <given-names>M.</given-names></name> <name><surname>Longjohn</surname> <given-names>R.</given-names></name> <name><surname>Nottingham</surname> <given-names>K.</given-names></name></person-group> (<year>2024</year>). <source>Uci Machine Learning Repository</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://archive.ics.uci.edu">https://archive.ics.uci.edu</ext-link></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kohonen</surname> <given-names>T.</given-names></name></person-group> (<year>1990</year>). <article-title>The self-organizing map</article-title>. <source>Proc. IEEE</source>. <volume>78</volume>, <fpage>1464</fpage>&#x02013;<lpage>1480</lpage>. <pub-id pub-id-type="doi">10.1109/5.58325</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Krishnamurthy</surname> <given-names>Y.</given-names></name> <name><surname>Watkins</surname> <given-names>C.</given-names></name> <name><surname>Gaertner</surname> <given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>Improving expert specialization in mixture of experts</article-title>. <source>arXiv preprint arXiv:2302.14703</source>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Qiao</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Design of a modular neural network based on an improved soft subspace clustering algorithm</article-title>. <source>Expert Syst. Appl</source>. <volume>209</volume>:<fpage>118219</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2022.118219</pub-id></citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lima</surname> <given-names>C. A.</given-names></name> <name><surname>Coelho</surname> <given-names>A. L.</given-names></name> <name><surname>Von Zuben</surname> <given-names>F. J.</given-names></name></person-group> (<year>2007</year>). <article-title>Hybridizing mixtures of experts with support vector machines: Investigation into nonlinear dynamic systems identification</article-title>. <source>Inf. Sci</source>. <volume>177</volume>, <fpage>2049</fpage>&#x02013;<lpage>2074</lpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2007.01.009</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Macqueen</surname> <given-names>J.</given-names></name></person-group> (<year>1967</year>). <source>Some Methods for Classification and Analysis of Multivariate Observations</source>. <publisher-loc>Oakland, CA</publisher-loc>: <publisher-name>University of California Press</publisher-name>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Matzka</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <source>AI4I 2020 Predictive Maintenance Dataset</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Meeds</surname> <given-names>E.</given-names></name> <name><surname>Osindero</surname> <given-names>S.</given-names></name></person-group> (<year>2005</year>). <article-title>&#x0201C;An alternative infinite mixture of gaussian process experts,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, 18.</citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Moro</surname> <given-names>S.</given-names></name> <name><surname>Rita</surname> <given-names>P.</given-names></name> <name><surname>Vala</surname> <given-names>B.</given-names></name></person-group> (<year>2016</year>). <source>Facebook Metrics</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nash</surname> <given-names>W.</given-names></name> <name><surname>Sellers</surname> <given-names>T.</given-names></name> <name><surname>Talbot</surname> <given-names>S.</given-names></name> <name><surname>Cawthorn</surname> <given-names>A.</given-names></name> <name><surname>Ford</surname> <given-names>W.</given-names></name></person-group> (<year>1995</year>). <source>Abalone</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ng</surname> <given-names>S.-K.</given-names></name> <name><surname>McLachlan</surname> <given-names>G. J.</given-names></name></person-group> (<year>2004</year>). <article-title>Using the em algorithm to train neural networks: misconceptions and a new algorithm for multiclass classification</article-title>. <source>IEEE Trans. Neural Netw</source>. <volume>15</volume>, <fpage>738</fpage>&#x02013;<lpage>749</lpage>. <pub-id pub-id-type="doi">10.1109/TNN.2004.826217</pub-id><pub-id pub-id-type="pmid">15384560</pub-id></citation></ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nikolic</surname> <given-names>S.</given-names></name> <name><surname>Oguz</surname> <given-names>I.</given-names></name> <name><surname>Psaltis</surname> <given-names>D.</given-names></name></person-group> (<year>2025</year>). <article-title>Exploring expert specialization through unsupervised training in sparse mixture of experts</article-title>. <source>arXiv preprint arXiv:2509.10025</source>.</citation>
</ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oldfield</surname> <given-names>J.</given-names></name> <name><surname>Georgopoulos</surname> <given-names>M.</given-names></name> <name><surname>Chrysos</surname> <given-names>G.</given-names></name> <name><surname>Tzelepis</surname> <given-names>C.</given-names></name> <name><surname>Panagakis</surname> <given-names>Y.</given-names></name> <name><surname>Nicolaou</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>&#x0201C;Multilinear mixture of experts: Scalable expert specialization through factorization,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <fpage>53022</fpage>&#x02013;<lpage>53063</lpage>.</citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Palechor</surname> <given-names>F. M.</given-names></name> <name><surname>la Hoz Manotas</surname> <given-names>A. D.</given-names></name></person-group> (<year>2019</year>). <source>Estimation of Obesity Levels Based On Eating Habits and Physical Condition</source>. UCI Machine Learning Repository. <pub-id pub-id-type="doi">10.1016/j.dib.2019.104344</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pham</surname> <given-names>Q.</given-names></name> <name><surname>Do</surname> <given-names>G.</given-names></name> <name><surname>Nguyen</surname> <given-names>H.</given-names></name> <name><surname>Nguyen</surname> <given-names>T.</given-names></name> <name><surname>Liu</surname> <given-names>C.</given-names></name> <name><surname>Sartipi</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Competesmoe-effective training of sparse mixture of experts via competition</article-title>. <source>arXiv preprint arXiv:2402.02526</source>.</citation>
</ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Piwko</surname> <given-names>J.</given-names></name> <name><surname>Ruci&#x00144;ski</surname> <given-names>J.</given-names></name> <name><surname>P&#x00142;udowski</surname> <given-names>D.</given-names></name> <name><surname>Zajko</surname> <given-names>A.</given-names></name> <name><surname>&#x0017B;ak</surname> <given-names>P.</given-names></name> <name><surname>Zacharecki</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Divide, specialize, and route: a new approach to efficient ensemble learning</article-title>. <source>arXiv preprint arXiv:2506.20814</source>.</citation>
</ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Quinlan</surname> <given-names>R.</given-names></name></person-group> (<year>1993</year>). <source>Auto MPG</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rezaee</surname> <given-names>M. J.</given-names></name> <name><surname>Eshkevari</surname> <given-names>M.</given-names></name> <name><surname>Saberi</surname> <given-names>M.</given-names></name> <name><surname>Hussain</surname> <given-names>O.</given-names></name></person-group> (<year>2021</year>). <article-title>Gbk-means clustering algorithm: an improvement to the k-means algorithm based on the bargaining game</article-title>. <source>Knowl. Based Syst</source>. <volume>213</volume>:<fpage>106672</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2020.106672</pub-id></citation>
</ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sathishkumar</surname> <given-names>V. E.</given-names></name> <name><surname>Cho</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <source>Seoul Bike Sharing Demand</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schlimmer</surname> <given-names>J.</given-names></name></person-group> (<year>1987</year>). <source>Automobile</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Mirhoseini</surname> <given-names>A.</given-names></name> <name><surname>Maziarz</surname> <given-names>K.</given-names></name> <name><surname>Davis</surname> <given-names>A.</given-names></name> <name><surname>Le</surname> <given-names>Q.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Outrageously large neural networks: the sparsely-gated mixture-of-experts layer</article-title>. <source>arXiv preprint arXiv:1701.06538</source>.</citation>
</ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tfekci</surname> <given-names>P.</given-names></name> <name><surname>Kaya</surname> <given-names>H.</given-names></name></person-group> (<year>2014</year>). <source>Combined Cycle Power Plant</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tresp</surname> <given-names>V.</given-names></name></person-group> (<year>2000</year>). <article-title>&#x0201C;Mixtures of Gaussian processes,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, 13.</citation>
</ref>
<ref id="B51">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tsanas</surname> <given-names>A.</given-names></name> <name><surname>Little</surname> <given-names>M.</given-names></name></person-group> (<year>2009</year>). <source>Parkinsons Telemonitoring</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tsanas</surname> <given-names>A.</given-names></name> <name><surname>Xifara</surname> <given-names>A.</given-names></name></person-group> (<year>2012</year>). <source>Energy Efficiency</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B53">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ueda</surname> <given-names>N.</given-names></name> <name><surname>Ghahramani</surname> <given-names>Z.</given-names></name></person-group> (<year>2002</year>). <article-title>Bayesian model search for mixture models based on optimizing variational bounds</article-title>. <source>Neural Netw</source>. <volume>15</volume>, <fpage>1223</fpage>&#x02013;<lpage>1241</lpage>. <pub-id pub-id-type="doi">10.1016/S0893-6080(02)00040-0</pub-id></citation>
</ref>
<ref id="B54">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ukorigho</surname> <given-names>O.</given-names></name> <name><surname>Owoyele</surname> <given-names>O.</given-names></name></person-group> (<year>2025</year>). <article-title>&#x0201C;A competitive learning approach for specialized models: an approach to modelling complex physical systems with distinct functional regimes,&#x0201D;</article-title> in <source>Proceedings A</source> (The Royal Society), <fpage>20240124</fpage>. <pub-id pub-id-type="doi">10.1098/rspa.2024.0124</pub-id></citation>
</ref>
<ref id="B55">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Waterhouse</surname> <given-names>S.</given-names></name> <name><surname>MacKay</surname> <given-names>D.</given-names></name> <name><surname>Robinson</surname> <given-names>A.</given-names></name></person-group> (<year>1995</year>). <article-title>&#x0201C;Bayesian methods for mixtures of experts,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, 8.</citation>
</ref>
<ref id="B56">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Weigend</surname> <given-names>A. S.</given-names></name> <name><surname>Mangeas</surname> <given-names>M.</given-names></name> <name><surname>Srivastava</surname> <given-names>A. N.</given-names></name></person-group> (<year>1995</year>). <article-title>Nonlinear gated experts for time series: discovering regimes and avoiding overfitting</article-title>. <source>Int. J. Neural Syst</source>. <volume>6</volume>, <fpage>373</fpage>&#x02013;<lpage>399</lpage>. <pub-id pub-id-type="doi">10.1142/S0129065795000251</pub-id></citation>
</ref>
<ref id="B57">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wolberg</surname> <given-names>W.</given-names></name> <name><surname>Street</surname> <given-names>W.</given-names></name> <name><surname>Mangasarian</surname> <given-names>O.</given-names></name></person-group> (<year>1995</year>). <source>Breast Cancer Wisconsin (Prognostic)</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B58">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>S.</given-names></name> <name><surname>Liew</surname> <given-names>A.-C.</given-names></name> <name><surname>Yan</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>M.</given-names></name></person-group> (<year>2004</year>). <article-title>Cluster analysis of gene expression data based on self-splitting and merging competitive learning</article-title>. <source>IEEE Trans. Inf. Technol. Biomed</source>. <volume>8</volume>, <fpage>5</fpage>&#x02013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/TITB.2004.824724</pub-id></citation>
</ref>
<ref id="B59">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>L.</given-names></name> <name><surname>Jordan</surname> <given-names>M.</given-names></name> <name><surname>Hinton</surname> <given-names>G. E.</given-names></name></person-group> (<year>1994</year>). <article-title>&#x0201C;An alternative model for mixtures of experts,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, 7.</citation>
</ref>
<ref id="B60">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yeh</surname> <given-names>I.-C.</given-names></name></person-group> (<year>2007</year>). <source>Concrete Compressive Strength</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B61">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yeh</surname> <given-names>I.-C.</given-names></name></person-group> (<year>2018</year>). <source>Real Estate Valuation</source>. UCI Machine Learning Repository.</citation>
</ref>
<ref id="B62">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yuan</surname> <given-names>C.</given-names></name> <name><surname>Neubauer</surname> <given-names>C.</given-names></name></person-group> (<year>2008</year>). <article-title>&#x0201C;Variational mixture of Gaussian process experts,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, 21.</citation>
</ref>
<ref id="B63">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yuksel</surname> <given-names>S. E.</given-names></name> <name><surname>Wilson</surname> <given-names>J. N.</given-names></name> <name><surname>Gader</surname> <given-names>P. D.</given-names></name></person-group> (<year>2012</year>). <article-title>Twenty years of mixture of experts</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>23</volume>, <fpage>1177</fpage>&#x02013;<lpage>1193</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2012.2200299</pub-id><pub-id pub-id-type="pmid">24807516</pub-id></citation></ref>
<ref id="B64">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.-J.</given-names></name> <name><surname>Liu</surname> <given-names>Z.-Q.</given-names></name></person-group> (<year>2002</year>). <article-title>Self-splitting competitive learning: a new on-line clustering paradigm</article-title>. <source>IEEE Trans. Neural Netw</source>. <volume>13</volume>, <fpage>369</fpage>&#x02013;<lpage>380</lpage>. <pub-id pub-id-type="doi">10.1109/72.991422</pub-id><pub-id pub-id-type="pmid">18244438</pub-id></citation></ref>
</ref-list>
</back>
</article>