<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1649239</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Improvements to dark experience replay and reservoir sampling for better balance between consolidation and plasticity</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Kobayashi</surname> <given-names>Taisuke</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<uri xlink:href="https://loop.frontiersin.org/people/1821690"/>
</contrib>
</contrib-group>
<aff id="aff1"><institution>National Institute of Informatics (NII), The Graduate University for Advanced Studies (SOKENDAI)</institution>, <city>Tokyo</city>, <country country="JP">Japan</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Taisuke Kobayashi, <email xlink:href="mailto:kobayashi@nii.ac.jp">kobayashi@nii.ac.jp</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-19">
<day>19</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1649239</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>25</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Kobayashi.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Kobayashi</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-19">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Continual learning is one of the most essential abilities for autonomous agents, which can incrementally learn daily-life skills even with limited computer resources. To achieve this goal, a simple yet powerful method called dark experience replay (DER) was recently proposed. DER mitigates catastrophic forgetting, where the skills acquired in the past are unintentionally forgotten when learning new skills, by stochastically storing streaming data in a reservoir sampling (RS) buffer and relearning them or retaining their past outputs. However, because DER considers multiple objectives, it does not function properly without appropriate weighting for each problem. In addition, the ability to retain past outputs inhibits learning if past outputs are inconsistent owing to distribution shifts or other effects. This is because of the trade-off between memory consolidation and plasticity. The trade-off is hidden even in the RS buffer, which gradually stops storing new data for new skills as data are continuously passed to it. To alleviate this trade-off and achieve a better balance, this study proposes improvement strategies for each DER and RS. Specifically, DER is improved by the automatic adaptation of weights, blocking of replaying inconsistent data, and correction of past outputs. RS is also improved with the generalization of acceptance probability, stratification of multiple buffers, and intentional omission of inconsistent data. These improvements were verified using multiple benchmarks including regression, classification, and reinforcement learning problems. Consequently, the proposed methods achieved a steady improvement in learning performance by balancing memory consolidation and plasticity.</p></abstract>
<kwd-group>
<kwd>consolidation and plasticity</kwd>
<kwd>continual learning</kwd>
<kwd>dark experience replay</kwd>
<kwd>reinforcement learning</kwd>
<kwd>reservoir sampling</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>Research Organization of Information and Systems</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/100013328</institution-id>
</institution-wrap>
</funding-source>
</award-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was supported by a &#x0201C;Strategic Research Projects&#x0201D; grant from the Research Organization of Information and Systems (ROIS).</funding-statement>
</funding-group>
<counts>
<fig-count count="9"/>
<table-count count="5"/>
<equation-count count="36"/>
<ref-count count="61"/>
<page-count count="17"/>
<word-count count="11676"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>Introduction</title>
<p>Machine learning technologies have made remarkable progress in recent years (<xref ref-type="bibr" rid="B37">LeCun et al., 2015</xref>; <xref ref-type="bibr" rid="B54">Touvron et al., 2023</xref>; <xref ref-type="bibr" rid="B30">Kirillov et al., 2023</xref>), and the basic (self-)supervised learning framework typically relies on a large dataset prepared in advance. However, new data are constantly increasing over time, so new skills often arise in addition to those in the dataset used for training. Alternatively, owing to limited computational resources, it is not possible to have all huge datasets in memory or storage, inevitably missing several skills. The goal of continual learning (CL) (or lifelong learning) is to incrementally obtain new skills in a machine learning model, even in such situations (<xref ref-type="bibr" rid="B45">Parisi et al., 2019</xref>). In other words, CL must train the models using streaming data without a pre-prepared dataset and unlimited computational resources. Note that this is similar to reinforcement learning (RL) settings, but recent RL has improved performance by leveraging experience replay with a sufficiently large buffer, so RL systems with limited computational resources must handle CL.</p>
<p>In this CL problem setting, catastrophic forgetting (or catastrophic inference), in which previously obtained skills are forgotten when new skills are learned, is a major issue (<xref ref-type="bibr" rid="B41">McClelland et al., 1995</xref>). Therefore, the main objective of CL research is to alleviate this problem. Three major approaches have been proposed:</p>
<list list-type="bullet">
<list-item><p>Regularization: Among the parameters in the model (e.g., weights and biases of neural networks), the essential ones for representing past skills are selected (e.g., according to Fisher information), and then they are regularized to keep the current values (<xref ref-type="bibr" rid="B31">Kirkpatrick et al., 2017</xref>; <xref ref-type="bibr" rid="B16">Farajtabar et al., 2020</xref>). The remaining parameters are utilized to learn new skills. Alternatively, instead of regularizing the parameter space, regularization can be applied to the output space of the model to implicitly select and retain parameters important for representing past skills (<xref ref-type="bibr" rid="B53">Titsias et al., 2020</xref>; <xref ref-type="bibr" rid="B28">Khan and Swaroop, 2021</xref>).</p></list-item>
<list-item><p>Rehearsal: By storing past data in a finite-size buffer, the model is trained not only on streaming but also on replayed data, enabling it to retain past skills even when acquiring new ones, similar to standard machine learning with a dataset (<xref ref-type="bibr" rid="B11">Chrysakis and Moens, 2020</xref>; <xref ref-type="bibr" rid="B52">Sun et al., 2022</xref>). Alternatively, instead of using a finite-size buffer, a data-generative model can be additionally trained to generate pseudo-past data, which can be used for learning (<xref ref-type="bibr" rid="B50">Shin et al., 2017</xref>; <xref ref-type="bibr" rid="B46">Pomponi et al., 2023</xref>).</p></list-item>
<list-item><p>Modularization: Each skill is learned mainly in its corresponding module within a well-designed model, and the modules are prevented from learning other skills by restricting (especially, freezing) their updates (<xref ref-type="bibr" rid="B33">Kobayashi and Sugino, 2020</xref>; <xref ref-type="bibr" rid="B27">Kang et al., 2022</xref>). The number of representable skills can be increased by adding modules to the model as needed (<xref ref-type="bibr" rid="B38">Li et al., 2019</xref>; <xref ref-type="bibr" rid="B44">Ostapenko et al., 2021</xref>).</p></list-item>
</list>
<p>CL methods that combine these approaches have also been proposed (<xref ref-type="bibr" rid="B8">Buzzega et al., 2020</xref>; <xref ref-type="bibr" rid="B14">Daxberger et al., 2023</xref>). In addition to this categorization, CL methods can be classified according to whether each data has a label that explicitly represents the corresponding skill, task, or class (with or without information about when the target skill changes) (<xref ref-type="bibr" rid="B3">Aljundi et al., 2019b</xref>; <xref ref-type="bibr" rid="B58">Ye and Bors, 2022</xref>). Naturally, methods without label information are more general-purpose and realistic; however, it is well known that the difficulty of the problem increases significantly.</p>
<p>Among previous CL methods without label information, dark experience replay (DER) (more precisely, DER&#x0002B;&#x0002B; in the original study) has attracted attention as a simple yet powerful method (<xref ref-type="bibr" rid="B8">Buzzega et al., 2020</xref>). DER corresponds to a combination of rehearsal and regularization approaches. Specifically, DER stores past data and corresponding outputs from the model at that time in a buffer, and uses them to maintain the past skills and outputs in conjunction with learning new skills. Although DER has a simple implementation that does not require label information, it can significantly mitigate catastrophic forgetting.</p>
<p>However, behind the simple implementation, DER requires weighting for the simultaneous optimization of multiple objectives; therefore, it does not function properly without fine-tuning these weights for each problem. Indeed, while the original paper reported results on several benchmarks, DER used different weights across them. Furthermore, past outputs do not necessarily represent past skills (due to insufficient learning or distribution shifts); therefore, the model may attempt to preserve outputs that are inconsistent with the current situation. Additional regularization (<xref ref-type="bibr" rid="B61">Zhuo et al., 2023</xref>) and prioritized sampling (<xref ref-type="bibr" rid="B57">Wang et al., 2024</xref>) have been proposed as improvements to DER to reduce the impact of such errors, but they do not eliminate them and tend to make the learning process more conservative, thereby reducing plasticity, as described below.</p>
<p>The buffer used in DER is a reservoir sampling (RS) buffer (<xref ref-type="bibr" rid="B56">Vitter, 1985</xref>). While a first-in-first-out (FIFO) buffer, which is widely used in experience replay (<xref ref-type="bibr" rid="B39">Lin, 1992</xref>; <xref ref-type="bibr" rid="B23">Isele and Cosgun, 2018</xref>), always stores new data and discards the oldest data, the RS buffer stochastically stores new data and discards previously stored data. Thus, the RS buffer can be regarded as sampling a finite subset of data with uniform probability from among all the data seen so far, retaining a portion of the data corresponding to past skills. However, in other words, the older the data, the more opportunities they have to be included in the buffer, and the shorter the retention time for newer data. Although the buffer&#x00027;s utility can be increased by storing highly informative samples (<xref ref-type="bibr" rid="B52">Sun et al., 2022</xref>; <xref ref-type="bibr" rid="B4">Aljundi et al., 2019c</xref>; <xref ref-type="bibr" rid="B7">Brignac et al., 2023</xref>), this approach does not directly resolve this issue. Some approaches have been proposed to encourage the acceptance of new data by decaying the storage probability of older data (<xref ref-type="bibr" rid="B12">Cormode et al., 2009</xref>; <xref ref-type="bibr" rid="B43">Osborne et al., 2014</xref>), but excessive decay undermines the benefits of RS.</p>
<p>Thus, the conventional DER and RS methods used in this study have unresolved issues. In particular, perhaps because DER focuses on resolving catastrophic forgetting, it prioritizes the maintenance of past skills, resulting in a loss of plasticity for the efficient acquisition of new skills. This is related to a trade-off between consolidation<xref ref-type="fn" rid="fn0003"><sup>1</sup></xref> and plasticity, which is a well-known problem faced by humans (<xref ref-type="bibr" rid="B18">Frank and Benington, 2006</xref>; <xref ref-type="bibr" rid="B42">Mermillod et al., 2013</xref>). Recently, it was reported that the plasticity of memory should be reconsidered (<xref ref-type="bibr" rid="B15">Dohare et al., 2024</xref>). Therefore, the current trend in which only consolidation is prioritized may be inappropriate. For example, as the distribution shift problem (<xref ref-type="bibr" rid="B34">Koh et al., 2021</xref>) suggests, past skills are not always correct and must be updated appropriately as the situation changes. Studies seeking a better balance between consolidation and plasticity often follow an approach that combines two models or structures that are biased in one direction or another (<xref ref-type="bibr" rid="B26">Jung et al., 2023</xref>; <xref ref-type="bibr" rid="B29">Kim et al., 2023</xref>), which increases the computational cost.</p>
<p>Therefore, this study seeks to improve DER and RS to achieve a better balance between memory consolidation and plasticity without introducing additional models or structures for satisfying limited computational resources. As a first contribution, a novel method called A2ER is proposed by incorporating three strategies into DER. Specifically, the <italic>adaptation</italic> strategy enables auto-tuning of the weights of DER and appropriately balances the learning from new data, learning from past data, and preservation of past outputs. Next, the <italic>block</italic> strategy suppresses the replay frequency of past data, which is inconsistent with the current models and data-generative distributions, for preventing the consolidation of wrong or unnecessary skills. Finally, the <italic>correction</italic> strategy corrects past outputs to make them consistent with the current situation and to increase plasticity.</p>
<p>As a second contribution, a new method called O2S is proposed by incorporating three strategies into RS. Specifically, the <italic>q-logarithm</italic> strategy generalizes the acceptance probability of the data passed to the RS buffer, allowing for the specification of the balance between consolidation and plasticity. Next, the <italic>plural</italic> strategy prepares multiple RS buffers connected in series, which gradually shift from highly plastic to highly consolidated. Finally, the <italic>omission</italic> strategy deletes inconsistent past data when migrating data between buffers, leaving more important data in long-term memory.</p>
<p>These improvements are numerically verified using multiple benchmarks. First, to demonstrate the effectiveness of A2ER, classification and regression tasks in CL settings are solved with a small buffer, showing that A2ER yields higher accuracy than before owing to an appropriate learning balance and improved plasticity. Similarly, RL tasks are accomplished efficiently without consolidating inconsistent value functions or past policies. Next, to demonstrate the effectiveness of O2S, it is verified that the underlying <italic>q-logarithm</italic> strategy can specify a balance between consolidation and plasticity in a classification task involving a single distribution shift. Finally, O2S achieves higher generalization performance in goal-conditioned RL robotic tasks while reducing the amount of data passed to the RS buffers.</p></sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and methods</title>
<sec>
<title>Preliminaries</title>
<sec>
<title>Continual learning</title>
<p>Let us consider the problem of CL, which is the subject of this study. First, an agent endlessly receives the input data <inline-formula><mml:math id="M1"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">X</mml:mi></mml:mrow><mml:mo>&#x02286;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">X</mml:mi></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula> (with <italic>t</italic> &#x0003D; 1, 2, &#x02026; as the time step) from the environment it interacts with. The corresponding output data <inline-formula><mml:math id="M2"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">Y</mml:mi></mml:mrow><mml:mo>&#x02286;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">Y</mml:mi></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula> are predicted. <italic>y</italic><sub><italic>t</italic></sub> may be provided, as in supervised learning, or estimated by bootstrapping from <italic>x</italic><sub><italic>t</italic></sub> and other variables, as in RL; however, the former is assumed here for simplicity. Using a function approximator [e.g., deep neural networks (<xref ref-type="bibr" rid="B37">LeCun et al., 2015</xref>)] with parameters &#x003B8;, the following minimization problem is solved:</p>
<disp-formula id="EQ1"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mo>*</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<disp-formula id="E2"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M5"><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:mrow><mml:mi mathvariant="script">X</mml:mi></mml:mrow><mml:mo>&#x021A6;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">Y</mml:mi></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula> denotes the function approximator to be optimized, and <inline-formula><mml:math id="M6"><mml:mi>g</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">Y</mml:mi></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:msup><mml:mo>&#x021A6;</mml:mo><mml:mrow><mml:mi mathvariant="script">Y</mml:mi></mml:mrow></mml:math></inline-formula> denotes a fixed mapping function (e.g., a sigmoid function). By setting an appropriate loss function <inline-formula><mml:math id="M7"><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:math></inline-formula>, <italic>y</italic>&#x02243;<italic>g</italic>(<italic>h</italic><sub>&#x003B8;</sub>(<italic>x</italic>)) can be obtained.</p>
<p>The difficulty of continual learning stems from the fact that <italic>t</italic> continues to increase and its maximum cannot be defined. In extreme cases, when <italic>t</italic> &#x02192; &#x0221E;, the aforementioned minimization problem cannot be numerically optimized. In addition, the data size of <inline-formula><mml:math id="M8"><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is limited by finite computational resources, particularly for embodied systems such as robots. 
Therefore, the FIFO buffer <inline-formula><mml:math id="M9"><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>FIFO</mml:mtext></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo>=</mml:mo><mml:mo class="qopname">max</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>FIFO</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> of finite size <italic>N</italic><sup>FIFO</sup>&#x02208;&#x02115; is often introduced, leading to the following surrogate minimization problem.</p>
<disp-formula id="EQ3"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mo>*</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>FIFO</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>This problem can be minimized using the stochastic gradient descent method (e.g., <xref ref-type="bibr" rid="B22">Ilboudo et al., 2023</xref>). In other words, a batch of data, the size of which is denoted by <italic>B</italic>, is randomly extracted from <italic>D</italic><sup>FIFO</sup>, and &#x003B8; is then updated using its gradient <inline-formula><mml:math id="M11"><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:mi>B</mml:mi><mml:msup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>B</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Although this solution enables optimization as in general deep learning, it discards past data from the optimization when <italic>t</italic>&#x0003E;<italic>N</italic><sup>FIFO</sup>. 
As <italic>t</italic> increases, the skills acquired from &#x003C4; &#x02264; max(0, <italic>t</italic>&#x02212;<italic>N</italic><sup>FIFO</sup>) are overwritten (unless the data in the FIFO buffer contain equivalent skills). This type of overwriting is known as catastrophic forgetting (<xref ref-type="bibr" rid="B41">McClelland et al., 1995</xref>).</p></sec>
<sec>
<title>Dark experience replay</title>
<p>Several approaches have been proposed to mitigate catastrophic forgetting. Among them, DER (<xref ref-type="bibr" rid="B8">Buzzega et al., 2020</xref>) (more precisely, DER&#x0002B;&#x0002B; in the original paper) is employed as a baseline of this study with slight modifications. DER is a simple yet powerful continual learning method that can be regarded as a combination of rehearsal and functional regularization. DER introduces the RS buffer (<xref ref-type="bibr" rid="B56">Vitter, 1985</xref>), <italic>D</italic><sup>RS</sup> (with size <italic>N</italic><sup>RS</sup>), to store past data that i) are passed in streaming format (the original implementation); or ii) overflow from the FIFO buffer (this paper&#x00027;s implementation). This RS buffer stochastically stores all the data seen so far with equal probability (see the next section), rather than storing the latest data first as in the FIFO buffer. In addition, feature <italic>z</italic><sub><italic>t</italic></sub> computed using <italic>h</italic><sub>&#x003B8;</sub> is stored in the RS buffer together with (<italic>x</italic><sub><italic>t</italic></sub>, <italic>y</italic><sub><italic>t</italic></sub>).</p>
<p>Under this design, the following minimization problem is solved:</p>
<disp-formula id="E4"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mo>*</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>FIFO</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mi 
mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="EQ5"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>where &#x003B2;&#x02208;[0, 1] is the coefficient that adjusts the learning priority between the FIFO and RS buffers, and &#x003B1;&#x02265;0 is the weight of the regularization term that preserves past features computed using the previous &#x003B8;. The second and third terms randomly select batches from <italic>D</italic><sup>RS</sup>. In the original implementation, each batch was selected independently; in this study, however, all data were selected simultaneously and split into two non-overlapping batches to strengthen the regularization of DER. This implementation can be regarded as a generalized version of CLEAR (<xref ref-type="bibr" rid="B47">Rolnick et al., 2019</xref>), which was originally designed for RL.</p></sec>
<sec>
<title>Reservoir sampling</title>
<p>As mentioned previously, the RS buffer used in DER stochastically stores all the data seen so far with equal probability (<xref ref-type="bibr" rid="B56">Vitter, 1985</xref>). Once the buffer is full, the following algorithm is applied to select the discarded data <italic>d</italic><sup>del</sup> based on the newly arriving <italic>n</italic>-th data point, <italic>d</italic>&#x02032;, and the existing entries <inline-formula><mml:math id="M14"><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msubsup></mml:math></inline-formula>.</p>
<disp-formula id="EQ6"><mml:math id="M15"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mi>k</mml:mi><mml:mo>~</mml:mo><mml:mi mathvariant='script'>U</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mtext>&#x000A0;</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mi>d</mml:mi><mml:mrow><mml:mtext>del</mml:mtext></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msup><mml:mi>d</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x0003E;</mml:mo><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02264;</mml:mo><mml:mi>k</mml:mi><mml:mo>&#x02264;</mml:mo><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>where <inline-formula><mml:math id="M17"><mml:mrow><mml:mi mathvariant="script">U</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>u</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> denotes a discrete uniform distribution over integers in the interval [<italic>l, u</italic>], <italic>l, u</italic>&#x02208;&#x02124; and <italic>l</italic> &#x02264; <italic>u</italic>. When data are discarded from the buffer, the new data replace the corresponding index as <inline-formula><mml:math id="M18"><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>.</p>
<p>In the above algorithm, the probability of accepting the new data <italic>d</italic><sub><italic>n</italic></sub> on the <italic>n</italic>-th pass is given as follows:</p>
<disp-formula id="E7"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02264;</mml:mo><mml:mi>k</mml:mi><mml:mo>&#x02264;</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:mi>k</mml:mi><mml:mo>&#x0007E;</mml:mo><mml:mrow><mml:mi mathvariant="script">U</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="EQ8"><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>where <italic>P</italic>(&#x000B7;) denotes the probability of satisfying the condition in parentheses. In addition, the probability that it remains in the RS buffer after <italic>n</italic>&#x02032; &#x0003D; 1, 2, &#x02026; additional steps is given by</p>
<disp-formula id="E9"><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02260;</mml:mo><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mtext>del</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x0220F;</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="EQ10"><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>These two probabilities show that the RS buffer stochastically stores all data seen so far with equal probability, inversely proportional to the total number of data points passed, <italic>n</italic>. Here, <italic>n</italic> is sometimes referred to as the reservoir counter (<xref ref-type="bibr" rid="B52">Sun et al., 2022</xref>).</p></sec>
<sec>
<title>Related work</title>
<p>In previous research on continual learning, various improvements have been proposed for rehearsal-based approaches, which involve deep engagement with DER and RS. For example, many methods (<xref ref-type="bibr" rid="B9">Buzzega et al., 2021</xref>; <xref ref-type="bibr" rid="B6">Boschini et al., 2022</xref>; <xref ref-type="bibr" rid="B10">Caccia et al., 2022</xref>; <xref ref-type="bibr" rid="B21">Harun et al., 2025</xref>) utilize class (or task) information to adjust the balance between data to be rehearsed and stored, or to redefine the loss function. However, such approaches are inappropriate for this study, which assumes a task-agnostic setting.</p>
<p>Without using task information, determining data value often requires significant additional computational cost. For example, <xref ref-type="bibr" rid="B4">Aljundi et al. (2019c</xref>),(<xref ref-type="bibr" rid="B2">a</xref>) evaluate high-importance data using gradient information or loss changes before and after updates. In either case, gradients must be computed separately from parameter learning, doubling computational cost. Before adding data to RS buffer, <xref ref-type="bibr" rid="B52">Sun et al. (2022)</xref> evaluates a new metric akin to information gain based on data novelty and learnability. However, similarly to the above, this computation also requires substantial redundant computation and models. Bayesian neural networks need to be introduced for estimating data uncertainty, another metric for data importance, incurring high cost (<xref ref-type="bibr" rid="B57">Wang et al., 2024</xref>). Thus, while several metrics for selecting data to replay and store have been proposed, all incur substantial computational costs solely for value estimation, making them unsuitable for systems with limited computational resources.</p>
<p>Another development involves revising the regularization terms introduced in DER and proposing new regularization terms. For example, <xref ref-type="bibr" rid="B61">Zhuo et al. (2023)</xref> proposes the forward consistency loss, although its computation requires retaining past model parameters. In the recent theoretical work (<xref ref-type="bibr" rid="B14">Daxberger et al., 2023</xref>), a loss integrating rehearsal, function regularization, and parameter-space regularization has been derived. However, its implementation still requires additional weight adjustment and estimation for parameter-space regularization compared to DER. Thus, regularization terms are not easily added, and in practice, they increase fine-tuning effort. This makes them difficult to handle in this study, which assumes scenarios lacking domain knowledge.</p>
<p>From the above, it is evident that under the task-agnostic setting with limited computational resources as the premise assumed by this study, the development of DER and related methods is not well-suited. Therefore, the subsequent discussion will proceed with the standard DER as the baseline, constructing a methodology that satisfies the aforementioned premise while enabling flexible adjustment of the trade-off between memory consolidation and plasticity.</p>
</sec>
</sec>
<sec>
<title>Improvements of DER: A2ER</title>
<sec>
<title>Open issues in DER</title>
<p>DER is an effective method that can strongly mitigate catastrophic forgetting despite its simple implementation. However, the hyperparameters &#x003B2; and &#x003B1; in DER must be appropriately tuned, depending on the target problem, to achieve optimal performance. In fact, the original study (<xref ref-type="bibr" rid="B8">Buzzega et al., 2020</xref>) used different values for each benchmark. In other words, the balance between memory consolidation and plasticity must be determined manually and often requires considerable effort.</p>
<p>Additionally, the third term in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref>, which corresponds to functional regularization, attempts to preserve past features, even if they are inconsistent with the current situation. That is, the features may be valid only for past situations and not after distribution shifts. In that case, plasticity is preferable to the consolidation enforced by the regularization.</p>
<p>To address these issues, this study proposes A2ER, which incorporates three strategies into DER (see <xref ref-type="fig" rid="F1">Figure 1</xref>): <italic>adaptation, block</italic>, and <italic>correction</italic>.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>A2ER with three strategies for enhancing DER.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a machine learning pipeline where streaming data enters a FIFO buffer, transitions to an RS buffer with uniform probability, and undergoes prioritized replay. Loss minimization and weight adaptation occur, with a process comparing current and previous feature vectors to trigger corrections if necessary.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>Adaptation of weights</title>
<p>Here, the <italic>adaptation</italic> strategy aims to automatically adjust &#x003B2; and &#x003B1; in DER, inspired by recent machine learning formulations (<xref ref-type="bibr" rid="B20">Haarnoja et al., 2018</xref>; <xref ref-type="bibr" rid="B51">Stooke et al., 2020</xref>). Specifically, the minimization problem in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref> can be reinterpreted as the minimization of <xref ref-type="disp-formula" rid="EQ3">Equation 2</xref> subject to the following equality constraints, which correspond to the second and third terms in DER:</p>
<disp-formula id="EQ11"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>FIFO</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<disp-formula id="EQ12"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where <inline-formula><mml:math id="M25"><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> for brevity, and &#x00394;<sub><italic>Q</italic></sub>&#x02265;0 denotes a threshold value for the average of &#x00394;<sub>&#x003C4;</sub>.</p>
<p>In this study, &#x00394;<sub><italic>Q</italic></sub> is heuristically defined as a variable updated at each step using the following quantile function <italic>Q</italic>:</p>
<disp-formula id="EQ13"><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:mi>B</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:mi>B</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>B</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:mi>&#x003C1;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>where &#x003C1;&#x02208;(0, 1) denotes the quantile level.</p>
<p>These equality constraints can be converted into regularization terms using Lagrange multipliers with &#x003B2; and &#x003B1;. By considering &#x00394;<sub><italic>Q</italic></sub> as a constant that is independent of &#x003B8;, this conversion is consistent with <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref>. Furthermore, &#x003B2; and &#x003B1; can be optimized to satisfy the equality constraints, yielding the following auto-tuning rules:</p>
<disp-formula id="E14"><mml:math id="M27"><mml:msup><mml:mi>&#x003B2;</mml:mi><mml:mo>&#x02217;</mml:mo></mml:msup><mml:mo>=</mml:mo><mml:mi>arg</mml:mi><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:munder><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:mo>&#x0007B;</mml:mo><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:msup><mml:mi>D</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo stretchy='false'>[</mml:mo><mml:mi>&#x02112;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>]</mml:mo></mml:math></disp-formula>
<disp-formula id="EQ15"><mml:math id="M28"><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:msup><mml:mi>D</mml:mi><mml:mrow><mml:mtext>FIFO</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo stretchy='false'>[</mml:mo><mml:mi>&#x02112;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>]</mml:mo><mml:mo>&#x0007D;</mml:mo></mml:math><label>(10)</label></disp-formula>
<disp-formula id="EQ16"><mml:math id="M29"><mml:msup><mml:mi>&#x003B1;</mml:mi><mml:mo>&#x02217;</mml:mo></mml:msup><mml:mo>=</mml:mo><mml:mi>arg</mml:mi><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:munder><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:msup><mml:mi>D</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>&#x00394;</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mi>&#x00394;</mml:mi><mml:mi>Q</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math><label>(11)</label></disp-formula>
<p>These are also solved by stochastic gradient descent, together with the minimization problem in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref>. Note that although the Lagrange multipliers (i.e., &#x003B2; and &#x003B1; in this case) are originally real numbers, their respective domains are restricted to &#x003B2;&#x02208;[0, 1] and &#x003B1;&#x02265;0. These constraints can be enforced using sigmoid and softplus functions, respectively. Even with these transformations, &#x003B2; and &#x003B1; can be efficiently optimized using the mirror descent method (<xref ref-type="bibr" rid="B5">Beck and Teboulle, 2003</xref>).</p>
<p>With the above formulation, &#x003B2; related to the first constraint brings about a proper balance between consolidation and plasticity by ensuring that the latest data in the FIFO buffer and the past data in the RS buffer incur similar levels of loss. The second constraint, governed by &#x003B1;, empirically strengthens consolidation by suppressing excessive changes in features, and increases plasticity by reducing functional regularization after sufficient consolidation. Although &#x003C1; is added for computing the threshold &#x00394;<sub><italic>Q</italic></sub>, its tuning is task-agnostic and robust (see the experimental results later).</p></sec>
<sec>
<title>Block of replays and correction of features</title>
<p>In the second equality constraint, past data that exceed &#x00394;<sub><italic>Q</italic></sub> are classified as either inconsistent owing to distribution shifts or as having non-optimized features owing to insufficient learning. Therefore, the <italic>block</italic> strategy restricts the replay probability of such data, which are inconsistent with the current situation, similar to intentional forgetting (<xref ref-type="bibr" rid="B24">Johnson, 1994</xref>), whereas the <italic>correction</italic> strategy updates the features in the buffer if the error is caused by a lack of learning, similar to memory engram updating (<xref ref-type="bibr" rid="B25">Josselyn and Tonegawa, 2020</xref>). Because these strategies share many underlying processes, they are introduced together in this section.</p>
<p>First, it is assumed that the more &#x00394;<sub>&#x003C4;</sub> deviates from &#x00394;<sub><italic>Q</italic></sub>, the more the features are considered inconsistent (or non-optimized). Therefore, the following quantity &#x003B7;<sub>&#x003C4;</sub>&#x02208;[0, 1] is designed to be proportional to the degree of deviation.</p>
<disp-formula id="EQ17"><mml:math id="M30"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mtext>clip</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<disp-formula id="EQ18"><mml:math id="M31"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x003C1;</mml:mi></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p>where clip(<italic>x</italic>; <italic>l, u</italic>) is a function that clips <italic>x</italic> to the interval [<italic>l, u</italic>], with <italic>l, u</italic>&#x02208;&#x0211D; and <italic>l</italic> &#x02264; <italic>u</italic>. <inline-formula><mml:math id="M32"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> serves as an approximate empirical estimate of the maximum &#x00394;<sub>&#x003C4;</sub>.</p>
<p>Using the calculated &#x003B7;<sub>&#x003C4;</sub>, the following update of &#x00394;<sub>&#x003C4;</sub> is considered (by correcting <italic>z</italic><sub>&#x003C4;</sub>):</p>
<disp-formula id="EQ19"><mml:math id="M33"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02190;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<p>With this design, &#x00394;<sub>&#x003C4;</sub> in the range <inline-formula><mml:math id="M34"><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02264;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02264;</mml:mo><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> forms a quadratic curve with a maximum at &#x003B7;<sub>&#x003C4;</sub> &#x0003D; 1/2, and returns to &#x00394;<sub><italic>Q</italic></sub> at &#x003B7;<sub>&#x003C4;</sub> &#x0003D; 0, 1, as illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. That is, up to the midpoint, &#x00394;<sub>&#x003C4;</sub> is expected to be minimized through the auto-tuning of &#x003B1;; after that, the behavior shifts to the correction of <italic>z</italic><sub>&#x003C4;</sub>.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Updated &#x00394;<sub>&#x003C4;</sub> using &#x003B7;<sub>&#x003C4;</sub>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0002.tif">
<alt-text content-type="machine-generated">Line graph with two curves shows updated value versus error norm delta sub tau. One curve is a dashed diagonal line indicating a linear relationship, while the other is a green curve that rises above, flattens, and then becomes horizontal. Vertical and horizontal dashed lines mark the points delta sub Q and bar delta sub Q.</alt-text>
</graphic>
</fig>
<p>The correction rate &#x003B3;<sub>&#x003C4;</sub>&#x02208;[0, 1] of <italic>z</italic><sub>&#x003C4;</sub> toward <italic>h</italic><sub>&#x003B8;</sub>(<italic>x</italic><sub>&#x003C4;</sub>) (i.e., the current features) is defined as follows:</p>
<disp-formula id="E20"><mml:math id="M35"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="EQ21"><mml:math id="M36"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>&#x02234;</mml:mo></mml:mtd><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<p>The larger &#x003B3;<sub>&#x003C4;</sub> is, the more inconsistent past data are. The <italic>block</italic> strategy suppresses the use of past data themselves for replay and learning. Specifically, although the expectation over <italic>D</italic><sup>RS</sup> in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref> normally replays all data with uniform probability <inline-formula><mml:math id="M37"><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>/</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:math></inline-formula>, each probability is instead weighted as follows:</p>
<disp-formula id="EQ22"><mml:math id="M38"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<disp-formula id="E23"><mml:math id="M39"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mtd><mml:mtd><mml:mo>&#x02190;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003BB;&#x02208;(0, 1) denotes the hyperparameter for the exponential moving average of 1&#x02212;&#x003B3;<sub>&#x003C4;</sub>. That is, if the error norm does not become sufficiently small even after repeated learning and correction, the data will be gradually blocked for replay. Note that &#x003BB; is of low importance, because the method works well even when &#x003BB; is set relatively large (i.e., with only a slight smoothing effect).</p>
<p>In the <italic>correction</italic> strategy, <italic>z</italic><sub>&#x003C4;</sub> in the buffer is updated using &#x003B3;<sub>&#x003C4;</sub> as follows:</p>
<disp-formula id="EQ24"><mml:math id="M40"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02190;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<p>The minimization problem in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref> can be solved after this correction. In practice, this can be done by appropriately compensating the loss function associated with <italic>z</italic><sub>&#x003C4;</sub>. That is, &#x00394;<sub>&#x003C4;</sub>&#x02212;&#x00394;<sub><italic>Q</italic></sub> in the update rule for &#x003B1; in <xref ref-type="disp-formula" rid="EQ16">Equation 11</xref> is multiplied by &#x003B7;<sub>&#x003C4;</sub>, and &#x00394;<sub>&#x003C4;</sub> in the update rule for &#x003B8; in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref> is multiplied by <inline-formula><mml:math id="M41"><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. In this way, additional computations can be avoided by reusing values already stored in RAM.</p>
</sec>
</sec>
<sec>
<title>Improvements of RS: O2S</title>
<sec>
<title>Open issues in RS</title>
<p>The RS buffer used in DER accepts and stores data with a probability inversely proportional to the reservoir counter <italic>n</italic>, as expressed in <xref ref-type="disp-formula" rid="EQ5">Equations 5</xref> and <xref ref-type="disp-formula" rid="EQ6">6</xref>. In other words, when <italic>n</italic> becomes very large, new data are rarely accepted into the RS buffer; however, if accepted once, they can be stored for a long time. This characteristic is effective in preventing past data from being discarded and in improving consolidation, while it impairs plasticity needed to adapt to distributional shifts.</p>
<p>In addition, data have varying degrees of importance, as suggested by the <italic>block</italic> strategy, and it is undesirable to pass inconsistent data to the RS buffer, as noted in previous work (<xref ref-type="bibr" rid="B52">Sun et al., 2022</xref>). In other words, if inconsistent data are stored in the RS buffer, they may impede learning. Furthermore, simply passing such data to the RS buffer increases <italic>n</italic>, which reduces the acceptance rate for more important data that may arrive later.</p>
<p>To resolve these issues, this study proposes O2S, which introduces three strategies to enhance RS (see <xref ref-type="fig" rid="F3">Figure 3</xref>): <italic>q-logarithm</italic>, <italic>plural</italic>, and <italic>omissions</italic>.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>O2S with three strategies for improving RS.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0003.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a data buffering process with streaming data entering a FIFO buffer, followed by a random sampling buffer with short-term memory, and then a buffer with long-term memory, with the option for omission if unnecessary. An inset line graph shows acceptance probability versus number of data passed, comparing traditional and q-logarithm-based designs. Multiple buffers with different balances are noted.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>Generalization of acceptance rate using q-logarithm</title>
<p>First, the acceptance rate is generalized to make the balance between consolidation and plasticity adjustable. Specifically, a monotonically nondecreasing transformation <italic>f</italic>:&#x02124;&#x021A6;&#x02124; is designed to generate random numbers for <italic>n</italic>&#x0003E;<italic>N</italic><sup>RS</sup>, such that sampling in <xref ref-type="disp-formula" rid="EQ6">Equation 4</xref> becomes <inline-formula><mml:math id="M42"><mml:mi>k</mml:mi><mml:mo>&#x0007E;</mml:mo><mml:mrow><mml:mi mathvariant="script">U</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Note that <italic>f</italic> is required to be monotonically nondecreasing because it is also interpreted as a counter. The acceptance rate of the <italic>n</italic>-th new data point <italic>d</italic><sub><italic>n</italic></sub> is given as follows:</p>
<disp-formula id="EQ25"><mml:math id="M43"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(18)</label></disp-formula>
<p>To satisfy the definition of probability and maintain continuity with the case <italic>n</italic> &#x02264; <italic>N</italic><sup>RS</sup>, where the acceptance rate is 1, it must hold that <inline-formula><mml:math id="M44"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">lim</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x02192;</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:math></inline-formula>.</p>
<p>Considering the probability that <italic>d</italic><sub><italic>n</italic></sub> remains in the RS buffer at the (<italic>n</italic>&#x0002B;<italic>n</italic>&#x02032;)-th step, one term cannot be canceled out, although the derivation procedure is similar to that of <xref ref-type="disp-formula" rid="EQ10">Equation 6</xref>.</p>
<disp-formula id="E26"><mml:math id="M45"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02260;</mml:mo><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mtext>del</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02003;</mml:mtext><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="EQ27"><mml:math id="M46"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x0220F;</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(19)</label></disp-formula>
<p>Here, the first term corresponds to the conventional case with <italic>f</italic>(<italic>n</italic>) &#x0003D; <italic>n</italic>, whereas the second term&#x02014;that is, the cumulative product operation&#x02014;modifies it.</p>
<p>Even with such modifications, the definition of probability must still be satisfied. To this end, the condition for designing <italic>f</italic> can be derived by focusing on the inner part of the second term.</p>
<disp-formula id="E28"><mml:math id="M47"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>&#x00394;</mml:mi><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="EQ29"><mml:math id="M48"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mi>&#x00394;</mml:mi><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(20)</label></disp-formula>
<p>where &#x00394;<italic>f</italic>(<italic>n</italic>) &#x0003D; <italic>f</italic>(<italic>n</italic>)&#x02212;<italic>f</italic>(<italic>n</italic>&#x02212;1). If this term ever exceeds 1, then the resulting probability may also exceed 1, violating the definition of probability. Therefore, the sufficient condition &#x00394;<italic>f</italic>(<italic>n</italic>) &#x02264; 1 must be satisfied.</p>
<p>Thus, the generalized counter <italic>f</italic>(<italic>n</italic>) must satisfy the following conditions:</p>
<disp-formula id="EQ30"><mml:math id="M49"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">lim</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x02192;</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:munder></mml:mstyle><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn><mml:mo>&#x02264;</mml:mo><mml:mi>&#x00394;</mml:mi><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02264;</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(21)</label></disp-formula>
<p>When &#x00394;<italic>f</italic>(<italic>n</italic>) &#x0003D; 1, new and past data are equally likely to be included in the buffer after each update, promoting the storage of past data and supporting consolidation. When &#x00394;<italic>f</italic>(<italic>n</italic>) &#x0003D; 0, the probability of past data remaining in the buffer decays exponentially because of the cumulative product, as in the case where <italic>f</italic>(<italic>n</italic>) &#x0003D; <italic>c</italic> at convergence. This facilitates the acceptance of new data and enhances plasticity. In other words, if &#x00394;<italic>f</italic>(<italic>n</italic>) can be specified and adjusted to a suitable degree, it becomes possible to balance between consolidation and plasticity.</p>
<p>There are several possible candidates for functions that satisfy these conditions, but this study introduces the following <italic>q</italic>-logarithmic function (<xref ref-type="bibr" rid="B55">Tsallis, 1988</xref>), referred to as the <italic>q-logarithm</italic> strategy:</p>
<disp-formula id="EQ31"><mml:math id="M50"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mi>f</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mi>min</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mo>+</mml:mo><mml:mrow><mml:mo>&#x0230A;</mml:mo><mml:mrow><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mi>ln</mml:mi></mml:mrow><mml:mi>q</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>max</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x0230B;</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>ln</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mi>ln</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd><mml:mtd 
columnalign='left'><mml:mrow><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mfrac><mml:mrow><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mi>q</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mi>q</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mi>q</mml:mi><mml:mo>&#x02260;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(22)</label></disp-formula>
<p>where <italic>q</italic>&#x02208;[0, 2] is a hyperparameter that balances consolidation and plasticity.</p>
<p><italic>f</italic><sub><italic>q</italic></sub>(<italic>n</italic>) is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. As shown in the figure, for <italic>q</italic><sub>1</sub>&#x0003E;<italic>q</italic><sub>2</sub>, <italic>f</italic><sub><italic>q</italic><sub>1</sub></sub>(<italic>n</italic>) &#x02264; <italic>f</italic><sub><italic>q</italic><sub>2</sub></sub>(<italic>n</italic>) holds. For <italic>q</italic> &#x0003D; 0, <italic>f</italic><sub><italic>q</italic></sub>(<italic>n</italic>) reduces to the conventional RS with <italic>f</italic><sub><italic>q</italic> &#x0003D; 0</sub>(<italic>n</italic>) &#x0003D; <italic>n</italic>, which yields the highest consolidation. For 0 &#x02264; <italic>q</italic> &#x02264; 1, <inline-formula><mml:math id="M53"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">lim</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x02192;</mml:mo><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x0221E;</mml:mi></mml:math></inline-formula>, meaning the buffer eventually stops accepting new data and storing past data, although there is a time lag before convergence. 
On the other hand, for <italic>q</italic>&#x0003E;1, an upper bound exists, and the function converges to <inline-formula><mml:math id="M54"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">lim</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x02192;</mml:mo><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msup><mml:mo>/</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>q</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Therefore, new data are accepted finally with constant probability and past data decay exponentially, increasing plasticity. Note that although <italic>q</italic> &#x02192; &#x0221E; is theoretically valid, the acceptance rate of new data converges to 0.5 with <italic>q</italic> &#x0003D; 2, which offers sufficient plasticity. This study therefore restricts <italic>q</italic> to the interval [0, 2], as previously defined.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Example of <italic>f</italic><sub><italic>q</italic></sub>(<italic>n</italic>) with <italic>N</italic><sup>RS</sup> &#x0003D; 100.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0004.tif">
<alt-text content-type="machine-generated">Line graph illustrating transformed counter values fq(n) against raw counter n for different q values ranging from 0.25 to 2.0, with each q represented by a distinct colored line. Dashed diagonal, horizontal, and vertical lines emphasize reference points.</alt-text>
</graphic>
</fig>
<p>Thus, a small <italic>q</italic> increases consolidation and a large <italic>q</italic> increases plasticity, allowing continuous adjustment of the balance between them. Other possible simple designs are examined in <xref ref-type="supplementary-material" rid="SM1">Appendix 4</xref>.</p></sec>
<sec>
<title>Plural buffers and omission of data passing</title>
<p>Although the designed counter <italic>f</italic><sub><italic>q</italic></sub> can adjust the balance between consolidation and plasticity, the trade-off itself is not eliminated and may lead to suboptimal performance unless it is appropriately tuned for the target problem. To alleviate this limitation, the <italic>plural</italic> strategy introduces a layered structure with multiple RS buffers configured differently, similar to human short- and long-term memory systems (<xref ref-type="bibr" rid="B13">Cowan, 2008</xref>). Furthermore, when passing data between multiple RS buffers, the importance of each data point is calculated in terms of sampling priority. Therefore, the <italic>omission</italic> strategy uses this information to determine whether the data should be passed to the next buffer.</p>
<p>First, the <italic>plural</italic> strategy prepares <italic>L</italic>&#x02208;&#x02115; layers of serially connected RS buffers <inline-formula><mml:math id="M55"><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>L</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> with respective sizes <inline-formula><mml:math id="M56"><mml:msubsup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>RS</mml:mtext></mml:mrow></mml:msubsup></mml:math></inline-formula>. Each has its own counter <italic>n</italic><sub><italic>l</italic></sub> and balance <italic>q</italic><sub><italic>l</italic></sub> for <italic>f</italic><sub><italic>q</italic></sub> in <xref ref-type="disp-formula" rid="EQ31">Equation 22</xref>. Here, the FIFO buffer is regarded as a special buffer at <italic>l</italic> &#x0003D; 0. Data processing is shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. When the past data in the <italic>l</italic>&#x02212;1-th buffer are discarded, they are passed to the <italic>l</italic>-th buffer. When new data passed to the <italic>l</italic>&#x02212;1-th buffer are not accepted, they are not passed to the <italic>l</italic>-th buffer. If <italic>q</italic><sub><italic>l</italic></sub> is small and consolidation is prioritized in the shallow layers of <italic>l</italic>&#x02243;1, the buffer rarely discards past data, thereby preventing the flow to subsequent layers. Therefore, for <italic>l</italic><sub>1</sub>&#x0003C;<italic>l</italic><sub>2</sub>, it is desirable that <italic>q</italic><sub><italic>l</italic><sub>1</sub></sub>&#x02265;<italic>q</italic><sub><italic>l</italic><sub>2</sub></sub>. 
In this way, buffers in deeper layers of <italic>l</italic>&#x02243;<italic>L</italic> will have a longer time scale because new data will be passed to them less frequently. Note that the batch for training in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref> is constructed as the sum of sub-batches (each of size |<italic>B</italic>|/<italic>L</italic>) sampled from all RS buffers.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Data processing in <italic>plural</italic> strategy with <italic>L</italic> RS buffers.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating sequential steps of inserting new data into a FIFO buffer, checking for buffer fullness, accepting or discarding data, progressing through additional buffers, and handling terminations or data passes based on logical conditions.</alt-text>
</graphic>
</fig>
<p>Second, the <italic>omission</italic> strategy intercepts the data passing from the <italic>l</italic>&#x02212;1-th to the <italic>l</italic>-th buffers. As previously mentioned, each data point has an importance value represented by the replay priority <inline-formula><mml:math id="M57"><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>. In other words, data that have already been, or will be, blocked from replay tend to waste buffer capacity; therefore, the buffer should be fully utilized by discarding this data before advancing its counter or passing it to the next buffer. By converting <inline-formula><mml:math id="M58"><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> into rejection probabilities for each buffer <inline-formula><mml:math id="M59"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup></mml:math></inline-formula>, a data point at time &#x003C4; is passed to the <italic>l</italic>-th buffer as <inline-formula><mml:math id="M60"><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> with the following rejection probability <italic>p</italic><sup>rej</sup>:</p>
<disp-formula id="EQ34"><mml:math id="M61"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(23)</label></disp-formula>
<disp-formula id="E35"><mml:math id="M62"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>max</mml:mtext></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>max</mml:mtext></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>min</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003BD;</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M63"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>max</mml:mtext></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M64"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>min</mml:mtext></mml:mrow></mml:msubsup></mml:math></inline-formula> denote the maximum and minimum values in either the <italic>l</italic>&#x02212;1- or <italic>l</italic>-th buffer, and <inline-formula><mml:math id="M65"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the replay priority for the current data. &#x003BD;&#x02265;0 adjusts the rejection rate; however, it is difficult to tune intuitively. 
Instead, this paper specifies the rejection probability &#x003B6;&#x02208;[0, 1], setting <inline-formula><mml:math id="M66"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:msup><mml:mrow><mml:mn>5</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x003BD;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> (for data with intermediate priority), which yields <inline-formula><mml:math id="M67"><mml:mi>&#x003BD;</mml:mi><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mo class="qopname">ln</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msqrt><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B6;</mml:mi></mml:mrow></mml:msqrt></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>/</mml:mo><mml:mo class="qopname">ln</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>2</mml:mn></mml:math></inline-formula>. Note that because the FIFO buffer does not have <inline-formula><mml:math id="M68"><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>, the data passed from <italic>l</italic> &#x0003D; 0 to <italic>l</italic> &#x0003D; 1 are always accepted, with <inline-formula><mml:math id="M69"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>. 
In addition, for numerical stability, <inline-formula><mml:math id="M70"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mrow><mml:mtext>rej</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula> if <inline-formula><mml:math id="M71"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mrow><mml:mtext>max</mml:mtext></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mrow><mml:mtext>min</mml:mtext></mml:mrow></mml:msubsup></mml:math></inline-formula> is less than &#x003F5; (in this study, 10<sup>&#x02212;5</sup>).</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec>
<title>Common setup</title>
<p>Multiple numerical benchmarks were used to validate the effectiveness of the proposed A2ER and O2S methods, the pseudo-codes of which are summarized in <xref ref-type="supplementary-material" rid="SM1">Appendix 1</xref>. The basic hyperparameter settings were the same across all benchmarks to confirm the task-agnostic design. Specifically, following the settings provided in DER (<xref ref-type="bibr" rid="B8">Buzzega et al., 2020</xref>), the FIFO buffer size, <italic>N</italic><sup>FIFO</sup>, was set to 512, and the RS buffer size, <italic>N</italic><sup>RS</sup>, was also 512 (in total, even when the <italic>plural</italic> strategy was applied). The batch size, the number of data replayed from each buffer at once, was set to |<italic>B</italic>| &#x0003D; 32. The initial value of &#x003B1; in <xref ref-type="disp-formula" rid="EQ5">Equation 3</xref> was set to 1, and the initial value of &#x003B2; was 0.5. These are the most natural values in the absence of prior knowledge of the problem, and they should also be used in DER to illustrate that DER without fine-tuning degrades its performance. The training frequency relative to the incoming data, <italic>H</italic>, and the number of batches replayed per training step, <italic>E</italic>, were specified individually for each benchmark. Additionally, AdaTerm (<xref ref-type="bibr" rid="B22">Ilboudo et al., 2023</xref>) was employed by default for optimization using stochastic gradient descent. Note that the computational cost of this setting is summarized in <xref ref-type="supplementary-material" rid="SM1">Appendix 2</xref>.</p>
<p>Among the hyperparameters involved in the proposed method, &#x003C1; in <xref ref-type="disp-formula" rid="EQ13">Equation 9</xref>, which defines the quantile function, appears to be particularly important, and its influence was investigated in <xref ref-type="supplementary-material" rid="SM1">Appendix 3</xref>. Consequently, &#x003C1; &#x0003D; 0.5 (i.e., the median value) was found to be robustly appropriate. Although there is room for adjusting &#x003BB;, defined in <xref ref-type="disp-formula" rid="EQ24">Equation 17</xref>, it was set to 0.5, which is the midpoint of its domain, for simplicity. Note that since &#x003BB; &#x0003D; 0.5 does not provide much smoothing, it might be allowable not to introduce <inline-formula><mml:math id="M72"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. <italic>q</italic> required for <xref ref-type="disp-formula" rid="EQ24">Equation 17</xref> was set to 1 for the last buffer, based on comparisons with other designs presented in <xref ref-type="supplementary-material" rid="SM1">Appendix 4</xref>. For simplicity, <italic>L</italic> &#x0003D; 2 was used for the <italic>plural</italic> strategy in this study, and for the first buffer, <italic>q</italic> &#x0003D; 1.5 was used to emphasize plasticity. Finally, the probability of rejecting inconsistent data when transferring between buffers was empirically set to &#x003B6; &#x0003D; 0.2, so that excessive counter growth could be suppressed to some extent via rejection.</p>
<p>The above settings are summarized in <xref ref-type="table" rid="T1">Table 1</xref>. Although there is room for fine-tuning these settings depending on the problem, this does not appear to be very important because the effectiveness of the proposed method has been confirmed across many benchmark problems, as shown below. Note that, as already mentioned in Section 2.1.4, since the baseline method for satisfying the task-agnostic setting under limited computational resources is limited to DER, subsequent comparisons are conducted only using DER and ablation studies of the proposed method.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Parameter configuration.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Symbol</bold></th>
<th valign="top" align="left"><bold>Meaning</bold></th>
<th valign="top" align="left"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>N</italic><sup>FIFO</sup></td>
<td valign="top" align="left">Size of FIFO buffer</td>
<td valign="top" align="left">512</td>
</tr>
<tr>
<td valign="top" align="left"><italic>N</italic><sup>RS</sup></td>
<td valign="top" align="left">Size of RS buffer(s)</td>
<td valign="top" align="left">512</td>
</tr>
<tr>
<td valign="top" align="left"><italic>B</italic></td>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="left">32</td>
</tr>
<tr>
<td valign="top" align="left">&#x003B1;<sup>ini</sup></td>
<td valign="top" align="left">Initial &#x003B1; in DER</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td valign="top" align="left">&#x003B2;<sup>ini</sup></td>
<td valign="top" align="left">Initial &#x003B2; in DER</td>
<td valign="top" align="left">0.5</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C1;</td>
<td valign="top" align="left">Quantile for threshold computation</td>
<td valign="top" align="left">0.5</td>
</tr>
<tr>
<td valign="top" align="left">&#x003BB;</td>
<td valign="top" align="left">Smoothness of data priority</td>
<td valign="top" align="left">0.5</td>
</tr>
<tr>
<td valign="top" align="left"><italic>q</italic></td>
<td valign="top" align="left">Parameter for generalized counter(s)</td>
<td valign="top" align="left">(1.5, 1)</td>
</tr>
<tr>
<td valign="top" align="left">&#x003B6;</td>
<td valign="top" align="left">Rejection probability</td>
<td valign="top" align="left">0.2</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>Results of A2ER</title>
<sec>
<title>Toy problems</title>
<p>First, to validate the effectiveness of A2ER, simple regression and classification problems were conducted, as shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. The detailed conditions are described later, but due to factors such as low learning frequency, they were set such that learning cannot be satisfactorily achieved unless the algorithm possesses sufficient plasticity while also maintaining consolidation. A neural network model consisting of two fully connected layers with 32 neurons each was trained for both toy problems. The model outputs a normal distribution for the regression problem and a categorical distribution for the classification problem. In both cases, the model was optimized by minimizing the negative log-likelihood with respect to the supervised data as the loss function.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Examples of toy problems. <bold>(a)</bold> Regression. <bold>(b)</bold> Classification.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0006.tif">
<alt-text content-type="machine-generated">Figure with two panels: Panel (a) shows a line graph with a jagged, upward-trending blue line and shaded uncertainty, over x and y axes from roughly negative three to three. Panel (b) displays a filled contour plot with several color regions&#x02014;red, blue, yellow, and orange&#x02014;bounded by black contour lines over axes ranging from negative one to one.</alt-text>
</graphic>
</fig>
<p>The regression problem aimed to predict the one-dimensional output of a composite function of up to seven sine waves with different phases, frequencies, and amplitudes, given a one-dimensional input in the range [&#x02212;2.5, 2.5]. During training, outputs with white noise (SD = 0.1) were obtained as the input increased in 0.001 increments from the minimum value, and the resulting input-output pairs were passed to the model sequentially. One cycle contained 5,000 data points, exceeding the buffer size, and was repeated five times to achieve sufficient accuracy. Training occurred after every 16 data points were added, and up to 16 batches were replayed from each buffer during a single training session. This problem was evaluated using the sum of Kullback-Leibler divergences (KLD) between the predicted and true distributions.</p>
<p>The classification problem aimed to predict which component of a Gaussian mixture distribution, comprising up to 16 components placed on a two-dimensional input space [&#x02212;1, 1]<sup>2</sup>, a given input belongs to, or whether it belongs to none. During training, the input space was divided into grids at intervals of 0.02, with added white noise (SD = 0.01). The outputs were the indices of the components with the highest likelihood. A threshold of 0.3<sup>2</sup> was set; if all likelihoods were below this threshold, the input was assigned an additional index, indicating outliers. This cycle was repeated five times for 10,000 data points to ensure accurate classification. Training was performed after every 32 new data points were added, and up to 16 batches were replayed from each buffer in a single training session. The task was evaluated using classification accuracy (ACC).</p>
<p>Using the above setup, four types of functions (sine waves and Gaussian mixture distributions) were prepared. They were trained under the following methods, each statistically evaluated using 20 different random seeds.</p>
<list list-type="bullet">
<list-item><p><italic>DER</italic>: the conventional method</p></list-item>
<list-item><p><italic>-Aa</italic>: without the <italic>adaptation</italic> strategy for &#x003B1;</p></list-item>
<list-item><p><italic>-Ab</italic>: without the <italic>adaptation</italic> strategy for &#x003B2;</p></list-item>
<list-item><p><italic>-B</italic>: without the <italic>block</italic> strategy</p></list-item>
<list-item><p><italic>-C</italic>: without the <italic>correction</italic> strategy</p></list-item>
<list-item><p><italic>A2ER</italic>: the proposed method</p></list-item>
</list>
<p>Because the purpose of this section is to evaluate A2ER, the conventional implementation of RS (not O2S) was employed.</p>
<p>The learning results are summarized in <xref ref-type="table" rid="T2">Table 2</xref>. To evaluate robustness across problems, a weighted average based on the ranks of the evaluation values was used to prioritize the worst cases. The conventional method, DER, was never among the top-2, perhaps because it was not fine-tuned and did not perform sufficiently well. The performance without the <italic>correction</italic> strategy was significantly worse than that of the conventional method. This may be because the <italic>adaptation</italic> strategy assigns too much weight to &#x003B1; without the reduction of the threshold &#x00394;<sub><italic>Q</italic></sub> by the <italic>correction</italic> strategy, inhibiting learning.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Results of toy problems for evaluating A2ER: the average over 20 trials per condition was weighted by rank, prioritizing the worst case; top-2 results are in bold.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center" colspan="4">Regression (KLD: &#x02193;)</th>
<th valign="top" align="center" colspan="4">Classification (ACC: &#x02191;)</th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>R1</bold></th>
<th valign="top" align="center"><bold>R2</bold></th>
<th valign="top" align="center"><bold>R3</bold></th>
<th valign="top" align="center"><bold>R4</bold></th>
<th valign="top" align="center"><bold>C1</bold></th>
<th valign="top" align="center"><bold>C2</bold></th>
<th valign="top" align="center"><bold>C3</bold></th>
<th valign="top" align="center"><bold>C4</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">DER</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">4.00</td>
<td valign="top" align="center">7.24</td>
<td valign="top" align="center">2.23</td>
<td valign="top" align="center">84.12</td>
<td valign="top" align="center">73.25</td>
<td valign="top" align="center">80.97</td>
<td valign="top" align="center">84.58</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">0.64&#x02013;0.92</td>
<td valign="top" align="center">3.71&#x02013;4.18</td>
<td valign="top" align="center">6.81&#x02013;7.48</td>
<td valign="top" align="center">1.98&#x02013;2.32</td>
<td valign="top" align="center">82.69&#x02013;85.82</td>
<td valign="top" align="center">71.37&#x02013;75.64</td>
<td valign="top" align="center">77.81&#x02013;83.91</td>
<td valign="top" align="center">82.70&#x02013;86.82</td>
</tr>
<tr>
<td valign="top" align="left">-Aa</td>
<td valign="top" align="center">0.49</td>
<td valign="top" align="center"><bold>2.83</bold></td>
<td valign="top" align="center">7.72</td>
<td valign="top" align="center">1.72</td>
<td valign="top" align="center">89.99</td>
<td valign="top" align="center">85.82</td>
<td valign="top" align="center">89.88</td>
<td valign="top" align="center">92.56</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">0.28&#x02013;0.73</td>
<td valign="top" align="center">2.29&#x02013;3.36</td>
<td valign="top" align="center">4.91&#x02013;13.36</td>
<td valign="top" align="center">1.51&#x02013;1.86</td>
<td valign="top" align="center">88.83&#x02013;91.57</td>
<td valign="top" align="center">85.18&#x02013;87.45</td>
<td valign="top" align="center">87.85&#x02013;92.26</td>
<td valign="top" align="center">91.67&#x02013;93.98</td>
</tr>
<tr>
<td valign="top" align="left">-Ab</td>
<td valign="top" align="center"><bold>0.40</bold></td>
<td valign="top" align="center">3.07</td>
<td valign="top" align="center"><bold>6.59</bold></td>
<td valign="top" align="center"><bold>1.63</bold></td>
<td valign="top" align="center">90.00</td>
<td valign="top" align="center">85.91</td>
<td valign="top" align="center">90.74</td>
<td valign="top" align="center"><bold>96.45</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">0.26&#x02013;0.49</td>
<td valign="top" align="center">2.50&#x02013;4.35</td>
<td valign="top" align="center">4.43&#x02013;8.29</td>
<td valign="top" align="center">1.44&#x02013;1.93</td>
<td valign="top" align="center">88.30&#x02013;92.21</td>
<td valign="top" align="center">84.92&#x02013;87.90</td>
<td valign="top" align="center">89.61&#x02013;93.17</td>
<td valign="top" align="center">95.94&#x02013;97.19</td>
</tr>
<tr>
<td valign="top" align="left">-B</td>
<td valign="top" align="center"><bold>0.36</bold></td>
<td valign="top" align="center">2.90</td>
<td valign="top" align="center">7.12</td>
<td valign="top" align="center"><bold>1.69</bold></td>
<td valign="top" align="center"><bold>91.33</bold></td>
<td valign="top" align="center"><bold>87.63</bold></td>
<td valign="top" align="center"><bold>91.77</bold></td>
<td valign="top" align="center">95.79</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">0.28&#x02013;0.45</td>
<td valign="top" align="center">2.36&#x02013;3.13</td>
<td valign="top" align="center">4.15&#x02013;12.07</td>
<td valign="top" align="center">1.44&#x02013;1.85</td>
<td valign="top" align="center">90.61&#x02013;92.76</td>
<td valign="top" align="center">86.71&#x02013;88.34</td>
<td valign="top" align="center">90.94&#x02013;93.40</td>
<td valign="top" align="center">95.29&#x02013;96.63</td>
</tr>
<tr>
<td valign="top" align="left">-C</td>
<td valign="top" align="center">1.56</td>
<td valign="top" align="center">6.19</td>
<td valign="top" align="center">10.44</td>
<td valign="top" align="center">2.14</td>
<td valign="top" align="center">65.99</td>
<td valign="top" align="center">39.68</td>
<td valign="top" align="center">62.24</td>
<td valign="top" align="center">64.78</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">0.83&#x02013;1.96</td>
<td valign="top" align="center">5.56&#x02013;6.41</td>
<td valign="top" align="center">9.50&#x02013;10.66</td>
<td valign="top" align="center">1.81&#x02013;2.34</td>
<td valign="top" align="center">63.38&#x02013;71.11</td>
<td valign="top" align="center">35.69&#x02013;47.84</td>
<td valign="top" align="center">60.25&#x02013;67.82</td>
<td valign="top" align="center">59.90&#x02013;71.34</td>
</tr>
<tr>
<td valign="top" align="left">A2ER</td>
<td valign="top" align="center">0.42</td>
<td valign="top" align="center"><bold>2.87</bold></td>
<td valign="top" align="center"><bold>7.03</bold></td>
<td valign="top" align="center">1.71</td>
<td valign="top" align="center"><bold>91.26</bold></td>
<td valign="top" align="center"><bold>87.52</bold></td>
<td valign="top" align="center"><bold>91.28</bold></td>
<td valign="top" align="center"><bold>95.82</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">0.30&#x02013;0.49</td>
<td valign="top" align="center">2.37&#x02013;3.59</td>
<td valign="top" align="center">4.44&#x02013;10.68</td>
<td valign="top" align="center">1.39&#x02013;1.78</td>
<td valign="top" align="center">90.07&#x02013;92.70</td>
<td valign="top" align="center">87.03&#x02013;88.70</td>
<td valign="top" align="center">89.89&#x02013;93.39</td>
<td valign="top" align="center">95.33&#x02013;96.85</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold indicates top results.</p>
</table-wrap-foot>
</table-wrap>
<p>Under the other conditions where the <italic>correction</italic> strategy was added, the performances were better than that of the conventional method. In particular, the proposed method, A2ER, achieved the highest number of top-2 entries. However, the performance without the <italic>block</italic> strategy was similar, and its usefulness was not confirmed by these benchmarks. On the other hand, by focusing on the case without the <italic>adaptation</italic> strategy, we can find that the automatic adjustment of &#x003B1; is particularly effective. When &#x003B2; was not auto-tuned, the performance degraded mainly in the classification problems. In fact, while &#x003B2; in A2ER stayed around 0.5 for the regression problems, it temporarily increased to over 0.8 for the classification problems. This means that the automatic adjustment of &#x003B2; improves performance by steering it toward an appropriate value, even when the initial setting is not optimal.</p>
<p>The above results indicate that the <italic>adaptation</italic> and <italic>correction</italic> strategies of the proposed method are necessary to improve the performance of DER. The <italic>block</italic> strategy may be activated only when inconsistent data remain in the buffers because of, for example, shifts in the data-generative distributions. The benchmark problems above do not have such characteristics; thus, the effectiveness of the <italic>block</italic> strategy could not be confirmed.</p></sec>
<sec>
<title>Reinforcement learning</title>
<p>To verify the effectiveness of the <italic>block</italic> strategy, additional benchmarks for reinforcement learning (RL) problems, where the data-generative distribution depends on the agent&#x00027;s policy, were conducted. As data generated with past policies would be inconsistent with the current policy, reverting to the inconsistent outputs might cause drastic policy updates, which are prone to make RL unstable (<xref ref-type="bibr" rid="B49">Schulman et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Saglam et al., 2023</xref>). If their replay can be appropriately blocked, a policy should be smoothly updated, improving control performance steadily.</p>
<p>Specifically, two problems in OpenAI Gym, <italic>InvertedDoublePendulum-v4</italic> (DoublePendulum) and <italic>Reacher-v4</italic> (Reacher), were solved using the soft actor-critic (SAC) algorithm (<xref ref-type="bibr" rid="B20">Haarnoja et al., 2018</xref>). The SAC implementation was adapted from that used in the literature (<xref ref-type="bibr" rid="B32">Kobayashi, 2025</xref>). The training occurred after every four interactions, during which up to one batch was replayed for training. DoublePendulum and Reacher were solved with 1,500 and 1,000 episodes, respectively, followed by 100 episodes using the trained policies for evaluation. Note that these tasks were selected as toy problems expected to be solvable even with small buffers, although their performance was lower than when trained with rich computational resources and buffers. In the RL problems, the interquartile mean (IQM) of the 100 episodes&#x00027; returns (i.e., the sum of rewards in an episode) is computed as the score for each trial, according to the literature (<xref ref-type="bibr" rid="B1">Agarwal et al., 2021</xref>).</p>
<p>The following four conditions were compared for the above tasks using 20 random seeds:</p>
<list list-type="bullet">
<list-item><p><italic>FIFO</italic>: using only a FIFO buffer, similar to standard RL algorithms with experience replay (<xref ref-type="bibr" rid="B39">Lin, 1992</xref>; <xref ref-type="bibr" rid="B23">Isele and Cosgun, 2018</xref>).</p></list-item>
<list-item><p><italic>DER</italic>: the conventional method.</p></list-item>
<list-item><p><italic>-B</italic>: the proposed method without the <italic>block</italic> strategy.</p></list-item>
<list-item><p><italic>A2ER</italic>: the proposed method.</p></list-item>
</list>
<p>To ensure fairness, the buffer size in the case using only the FIFO buffer was set to <italic>N</italic><sup>FIFO</sup>&#x02190;<italic>N</italic><sup>FIFO</sup>&#x0002B;<italic>N</italic><sup>RS</sup> &#x0003D; 1024, and the batch size was set to <italic>B</italic>&#x000D7;2 &#x0003D; 64. Note that even with an enlarged FIFO buffer, its size remains much smaller than those typically used in general RL implementations, and generalization performance is expected to decrease because it cannot retain sufficient past information. On the other hand, DER and the proposed method, which include RS buffers and regularization to past outputs, can improve generalization performance. However, excessive consolidation of past outputs may disturb or stop learning because some past data become inconsistent or non-optimized because of distribution shifts or insufficient learning.</p>
<p>The learning results are summarized in <xref ref-type="table" rid="T3">Table 3</xref>. As in the above benchmarks, a weighted average based on the ranks of the scores was used to prioritize the worst cases and demonstrate robustness. The first notable point is that DER made little progress in learning. This is because the regularization term to maintain past outputs was too strong with &#x003B1; &#x0003D; 1, which prevented the value function from being updated and led to failure in policy optimization. In addition, FIFO, a common implementation in RL, showed a poor success rate on DoublePendulum. In Reacher, on the other hand, FIFO did not fail significantly, although its score did not reach a satisfactory level. This is probably because, although learning proceeded due to high plasticity, the generalization performance to reach arbitrary target positions degraded due to the lack of consolidation caused by the small buffer size.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Results of RL tasks for evaluating A2ER: IQM of 100 returns was employed as a metric (higher is better); the average over 20 trials per condition was weighted by rank, prioritizing the worst case.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center" colspan="2"><bold>RL (IQM of returns:</bold> &#x02191;<bold>)</bold></th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>DoublePendulum</bold></th>
<th valign="top" align="center"><bold>Reacher</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FIFO</td>
<td valign="top" align="center">873.66</td>
<td valign="top" align="center">-12.29</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">206.71&#x02013;9359.81</td>
<td valign="top" align="center">-13.05&#x02013;-10.13</td>
</tr>
<tr>
<td valign="top" align="left">DER</td>
<td valign="top" align="center">60.55</td>
<td valign="top" align="center">-34.86</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">35.53&#x02013;129.96</td>
<td valign="top" align="center">-52.19&#x02013;-12.50</td>
</tr>
<tr>
<td valign="top" align="left">-B</td>
<td valign="top" align="center">1905.71</td>
<td valign="top" align="center">-16.08</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">130.18&#x02013;9359.86</td>
<td valign="top" align="center">-27.75&#x02013;-9.72</td>
</tr>
<tr>
<td valign="top" align="left">A2ER</td>
<td valign="top" align="center"><bold>2896.43</bold></td>
<td valign="top" align="center"><bold>-11.69</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">353.61&#x02013;9359.65</td>
<td valign="top" align="center">-12.51&#x02013;-8.43</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate top result.</p>
</table-wrap-foot>
</table-wrap>
<p>However, the proposed method, A2ER, outperformed the other methods on both problems. Although Reacher&#x00027;s score seems not to differ significantly from that of FIFO, both the worst and best cases of A2ER were better than those of FIFO, indicating that A2ER provided more stable generalization performance. This result is largely owing to the <italic>block</italic> strategy, as a clear performance drop was observed in its absence. In particular, the number of failure cases increased for both problems, with the worst case for Reacher being more than twice as severe. Thus, the <italic>block</italic> strategy mitigated the influence of inconsistent past data, which causes instability in learning and deterioration of generalization performance.</p>
<p>Based on the above results, we can conclude that all three strategies in the proposed method, A2ER, are essential for enhancing the performance of DER.</p>
</sec>
</sec>
<sec>
<title>Results of O2S</title>
<p>Next, O2S, a modified RS, was evaluated using a goal-conditioned RL (<xref ref-type="bibr" rid="B40">Liu et al., 2022</xref>) problem, which demands both memory consolidation and plasticity. In such problems, an agent must learn a policy capable of achieving any goal, with different goals randomly assigned in each episode. If the implementation relies solely on a (small) FIFO buffer and is overly plastic, it may quickly forget how to achieve earlier goals conditioned in the first half of the learning process. Conversely, if the system emphasizes consolidation too strongly, such as with the standard RS, it may fail to adapt to goals conditioned in the second half of the learning process. An appropriate balance can be achieved by leveraging the buffers&#x00027; ability to retain past data, alongside moderate updates with new data and the active exclusion of inconsistent data.</p>
<p>This problem is based on a modified version of <italic>PandaReachDense-v3</italic> in Panda-Gym (<xref ref-type="bibr" rid="B19">Gallou&#x000E9;dec et al., 2021</xref>) (see <xref ref-type="fig" rid="F7">Figure 7</xref>). Three key modifications were introduced. First, the initial joint angles of the robot were randomized using a uniform distribution. Second, the goal position for the robot&#x00027;s end effector was vertically offset by the length of the fingertips, with the offset range specified by a uniform distribution. Third, when the joint space was used as the action space, an orientation error penalty for the fingertip was added to the reward function. In addition, the end-effector orientation and joint angles were appended to the state space (originally, the end-effector position and velocity) to support this modification. Under these modifications, training was conducted for over 10,000 episodes using SAC with A2ER, following the same protocol as in the previous RL problems. The problems were categorized into three types based on the inclusion of noise: uniform noise with a width of &#x003C0;/4 added to the initial joint angles (N), and/or uniform noise with a width of 0.4 added to the goal position (G). Furthermore, the action space was defined either in the 3-DOF end-effector position space (E) or the 7-DOF joint space (J), resulting in six total problem variants.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Snapshots of goal-conditioned RL.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0007.tif">
<alt-text content-type="machine-generated">Simulation graphic showing a robotic arm on a platform. The left panel displays the arm reaching toward a green dot labeled &#x0201C;Goal&#x0201C;. The right panel shows the arm positioned with its gripper over the former goal point.</alt-text>
</graphic>
</fig>
<p>Because the three strategies in O2S are implemented in stages, performing ablation tests in the same manner as in the previous benchmarks is not feasible. Instead, the strategies were incrementally added to the original RS to assess the individual contributions of each. Accordingly, the following four methods were evaluated:</p>
<list list-type="bullet">
<list-item><p><italic>RS</italic>: the conventional method.</p></list-item>
<list-item><p><italic>Q2S</italic>: RS with only the <italic>q-logarithm</italic> strategy.</p></list-item>
<list-item><p><italic>P2S</italic>: without the <italic>omission</italic> strategy.</p></list-item>
<list-item><p><italic>O2S</italic>: the proposed method.</p></list-item>
</list>
<p>Note that P2S and O2S incorporate the <italic>plural</italic> strategy with <italic>L</italic> &#x0003D; 2 layers; thus, each of the two RS buffers is assigned a size of <italic>N</italic><sup>RS</sup>/2 &#x0003D; 256.</p>
<p>The results of 20 trials for each condition, using different random seeds, are summarized in <xref ref-type="table" rid="T4">Table 4</xref>. The evaluation method follows the same procedure used for the RL problems described earlier. A general trend observed was that tasks using the joint space were more challenging than those using the end-effector position space, resulting in lower returns (partly owing to the inclusion of the orientation error penalty). When noise was added only to the initial joint angles, fewer than 10,000 episodes were typically sufficient for learning, as similar states could be encountered through trial-and-error. However, the difficulty increased significantly when noise was added to the goal position. In such cases, the conventional RS failed to learn an optimal policy capable of reaching multiple goals.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Results of goal-conditioned RL tasks for evaluating O2S: the average over 20 trials per condition was weighted by rank, prioritizing the worst case; top-2 results are in bold.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center" colspan="3">End-effector space</th>
<th valign="top" align="center" colspan="3">Joint space</th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>E&#x0002B;N-G</bold></th>
<th valign="top" align="center"><bold>E-N&#x0002B;G</bold></th>
<th valign="top" align="center"><bold>E&#x0002B;N&#x0002B;G</bold></th>
<th valign="top" align="center"><bold>J&#x0002B;N-G</bold></th>
<th valign="top" align="center"><bold>J-N&#x0002B;G</bold></th>
<th valign="top" align="center"><bold>J&#x0002B;N&#x0002B;G</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">RS</td>
<td valign="top" align="center">-3.62</td>
<td valign="top" align="center">-6.25</td>
<td valign="top" align="center">-6.84</td>
<td valign="top" align="center">-7.30</td>
<td valign="top" align="center">-15.64</td>
<td valign="top" align="center">-16.13</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">-5.63&#x02013;-1.70</td>
<td valign="top" align="center">-11.57&#x02013;-3.21</td>
<td valign="top" align="center">-11.38&#x02013;-3.52</td>
<td valign="top" align="center">-11.86&#x02013;-4.36</td>
<td valign="top" align="center">-19.80&#x02013;-8.02</td>
<td valign="top" align="center">-21.14&#x02013;-11.71</td>
</tr>
<tr>
<td valign="top" align="left">Q2S</td>
<td valign="top" align="center">-2.47</td>
<td valign="top" align="center">-3.97</td>
<td valign="top" align="center"><bold>-4.22</bold></td>
<td valign="top" align="center">-6.19</td>
<td valign="top" align="center">-9.14</td>
<td valign="top" align="center"><bold>-8.13</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">-3.28&#x02013;-1.17</td>
<td valign="top" align="center">-7.00&#x02013;-2.40</td>
<td valign="top" align="center">-5.30&#x02013;-3.22</td>
<td valign="top" align="center">-8.58&#x02013;-3.83</td>
<td valign="top" align="center">-12.06&#x02013;-5.19</td>
<td valign="top" align="center">-9.64&#x02013;-5.81</td>
</tr>
<tr>
<td valign="top" align="left">P2S</td>
<td valign="top" align="center"><bold>-2.25</bold></td>
<td valign="top" align="center"><bold>-3.77</bold></td>
<td valign="top" align="center">-4.53</td>
<td valign="top" align="center"><bold>-5.03</bold></td>
<td valign="top" align="center"><bold>-9.00</bold></td>
<td valign="top" align="center">-8.17</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">-3.04&#x02013;-1.34</td>
<td valign="top" align="center">-6.15&#x02013;-1.94</td>
<td valign="top" align="center">-5.30&#x02013;-2.85</td>
<td valign="top" align="center">-8.06&#x02013;-3.70</td>
<td valign="top" align="center">-16.95&#x02013;-4.19</td>
<td valign="top" align="center">-9.65&#x02013;-5.55</td>
</tr>
<tr>
<td valign="top" align="left">O2S</td>
<td valign="top" align="center"><bold>-2.36</bold></td>
<td valign="top" align="center"><bold>-3.36</bold></td>
<td valign="top" align="center"><bold>-4.11</bold></td>
<td valign="top" align="center"><bold>-5.48</bold></td>
<td valign="top" align="center"><bold>-7.64</bold></td>
<td valign="top" align="center"><bold>-7.63</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">-2.81&#x02013;-1.27</td>
<td valign="top" align="center">-4.63&#x02013;-2.30</td>
<td valign="top" align="center">-5.48&#x02013;-2.83</td>
<td valign="top" align="center">-8.53&#x02013;-3.26</td>
<td valign="top" align="center">-9.31&#x02013;-4.34</td>
<td valign="top" align="center">-9.20&#x02013;-5.24</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold indicates top results.</p>
</table-wrap-foot>
</table-wrap>
<p>By contrast, incorporating the <italic>q-logarithm</italic> strategy to suppress the decay of acceptance probability for new data in the RS buffer enabled more goal variations to be retained. This led to a notable improvement in performance. Although the overall performance gain from the <italic>plural</italic> strategy appeared minor, a clear enhancement was observed in the best-case results, particularly in cases involving randomized goals. That is, although the <italic>plural</italic> strategy does not fully stabilize learning, it has the potential to enhance the expected performance.</p>
<p>Finally, with the addition of the <italic>omission</italic> strategy, top-2 performance was achieved across all problems. In particular, a substantial improvement was observed when goal positions were randomized. Moreover, the worst-case outcomes improved in nearly every case, indicating enhanced stability in learning. These improvements do not appear to stem from increased plasticity, as seen with the <italic>q-logarithm</italic> and <italic>plural</italic> strategies. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, the final acceptance probability of the <italic>L</italic>-th RS buffer highlights that the conventional RS lacks plasticity, and that this limitation is mitigated by the <italic>q-logarithm</italic> strategy. The <italic>plural</italic> strategy also contributed to increased plasticity, as evidenced by the higher acceptance of new data in the first RS buffer. However, the increase in acceptance probability resulting from the <italic>omission</italic> strategy was small, indicating that it does not significantly affect plasticity. Instead, this suggests that the <italic>omission</italic> strategy primarily aids in data selection. By actively removing inconsistent data, it maintains a more informative buffer, thereby promoting more stable learning.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Final acceptance probability of the <italic>L</italic>-th RS buffer.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Condition</bold></th>
<th valign="top" align="center"><bold>Probability [%]</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">RS</td>
<td valign="top" align="center"><italic>L</italic> &#x0003D; 1, <italic>q</italic> &#x0003D; 0</td>
<td valign="top" align="center">0.10</td>
</tr>
<tr>
<td valign="top" align="left">Q2S</td>
<td valign="top" align="center"><italic>L</italic> &#x0003D; 1, <italic>q</italic> &#x0003D; 1</td>
<td valign="top" align="center">12.69</td>
</tr>
<tr>
<td valign="top" align="left">P2S</td>
<td valign="top" align="center"><italic>L</italic> &#x0003D; 2, <italic>q</italic> &#x0003D; 1, &#x003B6; &#x0003D; 0</td>
<td valign="top" align="center">13.32</td>
</tr>
<tr>
<td valign="top" align="left">O2S</td>
<td valign="top" align="center"><italic>L</italic> &#x0003D; 2, <italic>q</italic> &#x0003D; 1, &#x003B6; &#x0003D; 0.2</td>
<td valign="top" align="center"><bold>13.43</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate top result.</p>
</table-wrap-foot>
</table-wrap>
<p>These results indicate that all three strategies incorporated in the proposed method, O2S, are essential for enhancing the performance of RS.</p>
</sec>
<sec>
<title>Image classification benchmarks compared to the state-of-the-art methods</title>
<p>Finally, an in-depth investigation of the proposed method&#x00027;s performance is conducted using common CL benchmarks and other baseline methods, which satisfy the problem settings of the proposed method. The first benchmarks are Permuted MNIST and Split MNIST. Both involve sequentially learning five tasks. To increase problem complexity, Random Erasing (<xref ref-type="bibr" rid="B60">Zhong et al., 2020</xref>) with default settings is applied to the training data. To match this increased complexity, the number of neurons in the two fully connected layers is increased to 512, and the batch size is increased to 128. The learning frequency is set to one batch per 128 data points passed. Furthermore, to highlight the importance of plasticity, the learning rate is reduced to 10<sup>&#x02212;5</sup>, and the number of epochs per task is limited to five. Therefore, the final classification accuracy is considered a reference record, and the focus is on relative accuracy differences.</p>
<p>The comparison methods include the basic DER and the proposed methods, A2ER and O2S (over A2ER), along with two state-of-the-art general-purpose CL methods that incorporate plasticity. Note that for the target benchmarks, memory consolidation is more important than in the above experiments (even with modified conditions emphasizing plasticity), so the O2S hyperparameters were adjusted to (<italic>q</italic><sub>1</sub>, <italic>q</italic><sub>2</sub>) &#x0003D; (0.5, 0.0).</p>
<p>Specifically, the first baseline is greedy sample selection (GSS) (<xref ref-type="bibr" rid="B4">Aljundi et al., 2019c</xref>). GSS selectively stores highly-novel data compared to the buffer&#x00027;s data, making the model plastic. Although GSS needs expensive per-sample gradients to compute novelty, its settings are sufficiently general, like those of the proposed method. It is possible to combine DER&#x00027;s function regularization, but the implemented GSS is based on experience replay (i.e., DER with &#x003B1; &#x0003D; 0) to align with the original implementation. Second is continual back propagation (CBP) (<xref ref-type="bibr" rid="B15">Dohare et al., 2024</xref>). CBP enhances plasticity by probabilistically reinitializing parameters with low contribution, which is added to the basic DER in this experiment. However, CBP&#x00027;s default settings are optimized for extremely long-term streams, making reinitialization infrequent in this experiment. Therefore, the probability of reinitialization was increased from 10<sup>&#x02212;4</sup> to 10<sup>&#x02212;3</sup>. Furthermore, since the regularization term that retains past outputs in the basic DER obviously interferes with reinitialization, &#x003B1; &#x0003D; 0.1, smaller than the default (i.e., &#x003B1; &#x0003D; 1), is set. Third is layerwise proximal replay (LPR) (<xref ref-type="bibr" rid="B59">Yoo et al., 2024</xref>). LPR implicitly retains past (layer-wise) outputs by projecting gradients. Since it updates the projection matrix at fixed intervals (default 10), it maintains plasticity without retaining overly old past outputs like the basic DER. However, adding LPR to the basic DER would risk losing the deliberately retained plasticity. Therefore, LPR is combined with experience replay (i.e., DER with &#x003B1; &#x0003D; 0). The hyperparameter &#x003C9; was set to 1, one of the recommended values. 
Fourth is gradient-guided epsilon constraint (GEC) (<xref ref-type="bibr" rid="B36">Lai et al., 2025</xref>). GEC treats CL as a constrained optimization problem, similar to the proposed method, and allows adjusting plasticity through the tolerance of the constraint. Like the others, GEC can also combine DER&#x00027;s function regularization, but the implementation follows the original paper without it. Note that the hyperparameters were left at their original values.</p>
<p>Under the above conditions, the learning curves for the test data obtained from 20 trials with different random seeds are plotted in <xref ref-type="fig" rid="F8">Figure 8</xref>. In the basic DER, its low plasticity slowed down learning in both problems, preventing the achievement of sufficient classification accuracy. GSS declined in classification performance on Split MNIST, likely because its buffer was easily overwritten by new class data. In CBP, the classification accuracy for Permuted MNIST was improved from the basic DER. However, probably due to the reset of the classifier for past classes (i.e., a kind of intentional forgetting), CBP did not achieve sufficient overall classification accuracy on Split MNIST. Similarly, LPR was able to quickly learn new data while maintaining classification accuracy for past data on Permuted MNIST (though hidden behind the line of GEC), but on Split MNIST, its learning lagged behind the basic DER. This might be because the gradients for learning the new class classifier tend to vanish due to the projection, and it took time to adjust the projection for the new classes. Among the baselines, only GEC achieved classification performance superior to or equivalent to DER for both problems. Furthermore, GEC&#x00027;s computational time and memory usage were almost the same as those of DER and the proposed methods, while the others showed a significant increase.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Learning curves of permuted MNIST and split MNIST with five tasks.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0008.tif">
<alt-text content-type="machine-generated">Line graph displays accuracy percentages over twenty-five epochs for seven methods&#x02014;DER, GSS, CBP, LPR, GEC, A2ER, and O2S&#x02014;on Permuted MNIST and Split MNIST tasks. All methods gradually improve, reaching over eighty percent accuracy.</alt-text>
</graphic>
</fig>
<p>Compared to these baselines, A2ER obviously achieved superior classification accuracy on both Permuted MNIST and Split MNIST. This demonstrates that the ability to automatically adjust the balance between consolidation and plasticity according to the problem is effective. On the other hand, while adding O2S enabled the highest classification accuracy on Permuted MNIST, it caused catastrophic forgetting in the final task on Split MNIST, leading to a decline in classification accuracy. As mentioned earlier, although the O2S setting prioritized consolidation, it still appears to have resulted in excessive plasticity. Further investigation into automatic hyperparameter tuning and appropriate buffer size allocation might be warranted.</p>
<p>Based on the above results, the more challenging CIFAR-10/100 are solved in a task-incremental setting with 2/10 tasks, respectively. Since this requires image inputs, ResNet18 is employed as the classification model, while its batch normalization was replaced by group normalization. The buffer size was increased to 5120 due to data diversity, but all other conditions are the same as for MNIST.</p>
<p>Referencing the results on MNIST, only GEC was tested as the state-of-the-art method since only it showed high similarity to the proposed method and also delivered superior results (classification performance and computational efficiency). That is, DER, GEC, A2ER, and O2S are compared, as depicted in <xref ref-type="fig" rid="F9">Figure 9</xref>. Note that these learning curves were statistically obtained from 10 trials with different random seeds.</p>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Learning curves of CIFAR-10 with two tasks and CIFAR-100 with 10 tasks.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1649239-g0009.tif">
<alt-text content-type="machine-generated">Side-by-side line graphs compare the accuracy percentages of four methods&#x02014;DER (green), GEC (red), A2ER (blue), and O2S (purple)&#x02014;over training epochs on CIFAR10 and CIFAR100 datasets, showing O2S generally achieves the highest accuracy.</alt-text>
</graphic>
</fig>
<p>Similar to the MNIST case, DER struggled to progress through training due to insufficient plasticity, resulting in inadequate classification accuracy. GEC efficiently acquired new knowledge and improved classification accuracy early in training, but its insufficient consolidation led to forgetting, preventing sufficient performance gains in later tasks. Both the proposed A2ER and O2S achieved higher classification accuracy than these baseline methods, suggesting they achieved an appropriate balance between consolidation and plasticity. Note that forgetting did not occur with O2S, as it did with MNIST, due to the increased buffer size.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="s4">
<title>Conclusion</title>
<p>This paper proposed A2ER and O2S, which introduce three improvement strategies for DER and RS, respectively, as versatile baselines for continual learning without requiring task label information and rich computational resources. Conventional methods primarily focus on maintaining past data and outputs, i.e., memory consolidation, which often compromises memory plasticity, or the ability to adapt to new data, because of the inherent trade-off between the two. The proposed methods were designed to improve this trade-off and achieve a more effective balance between consolidation and plasticity. Numerical experiments demonstrated that each of the proposed strategies contributed to improved performance across benchmarks where both consolidation and plasticity are crucial.</p>
<p>Furthermore, since the proposed A2ER and O2S are designed for operation on systems with limited computational resources, there is no theoretical difference in computational cost compared to the conventional DER, unless the total buffer size and batch size are increased according to <italic>L</italic>&#x0003E;1. In fact, there was little difference in the training time required for each method during the experiments. This lightweight computational cost not only enables operation in small-scale systems like those discussed in this paper but also suggests scalability to large-scale problems.</p>
<p>However, the proposed method involves several hyperparameters, even though they are robust across problems. As future work, automatic tuning of these parameters is an important next step. In this context, it will be essential to assess whether the problem requires greater consolidation or plasticity, and to accurately identify which data should be discarded from replay or storage. Achieving this would enable the proposed method to be applied for the training of large-scale practical models on extensive data streams. In particular, the proposed method holds promise in domains where data volume increases continuously, such as robotic foundation models (<xref ref-type="bibr" rid="B17">Firoozi et al., 2023</xref>) and the analyses of complex human motions (<xref ref-type="bibr" rid="B35">Kong and Fu, 2022</xref>). By conducting a comprehensive comparison with recent continual learning methods in these domains, it is believed that the proposed A2ER and O2S will be established as the standard in this field.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>TK: Conceptualization, Resources, Writing &#x02013; review &#x00026; editing, Supervision, Project administration, Validation, Investigation, Writing &#x02013; original draft, Methodology, Visualization, Data curation, Funding acquisition, Software, Formal analysis.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s8">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s10">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2026.1649239/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frai.2026.1649239/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Supplementary_file_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Agarwal</surname> <given-names>R.</given-names></name> <name><surname>Schwarzer</surname> <given-names>M.</given-names></name> <name><surname>Castro</surname> <given-names>P. S.</given-names></name> <name><surname>Courville</surname> <given-names>A. C.</given-names></name> <name><surname>Bellemare</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Deep reinforcement learning at the edge of the statistical precipice,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> <volume>34</volume>, <fpage>29304</fpage>&#x02013;<lpage>29320</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aljundi</surname> <given-names>R.</given-names></name> <name><surname>Belilovsky</surname> <given-names>E.</given-names></name> <name><surname>Tuytelaars</surname> <given-names>T.</given-names></name> <name><surname>Charlin</surname> <given-names>L.</given-names></name> <name><surname>Caccia</surname> <given-names>M.</given-names></name> <name><surname>Lin</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2019a</year>). <article-title>&#x0201C;Online continual learning with maximal interfered retrieval,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, 32.</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Aljundi</surname> <given-names>R.</given-names></name> <name><surname>Kelchtermans</surname> <given-names>K.</given-names></name> <name><surname>Tuytelaars</surname> <given-names>T.</given-names></name></person-group> (<year>2019b</year>). <article-title>&#x0201C;Task-free continual learning,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>11254</fpage>&#x02013;<lpage>11263</lpage>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aljundi</surname> <given-names>R.</given-names></name> <name><surname>Lin</surname> <given-names>M.</given-names></name> <name><surname>Goujaud</surname> <given-names>B.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name></person-group> (<year>2019c</year>). <article-title>&#x0201C;Gradient based sample selection for online continual learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> 32.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Beck</surname> <given-names>A.</given-names></name> <name><surname>Teboulle</surname> <given-names>M.</given-names></name></person-group> (<year>2003</year>). <article-title>Mirror descent and nonlinear projected subgradient methods for convex optimization</article-title>. <source>Operat. Res. Lett</source>. <volume>31</volume>, <fpage>167</fpage>&#x02013;<lpage>175</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0167-6377(02)00231-6</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Boschini</surname> <given-names>M.</given-names></name> <name><surname>Bonicelli</surname> <given-names>L.</given-names></name> <name><surname>Buzzega</surname> <given-names>P.</given-names></name> <name><surname>Porrello</surname> <given-names>A.</given-names></name> <name><surname>Calderara</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Class-incremental continual learning into the extended der-verse</article-title>. <source>IEEE Trans. Pattern Analy. Mach. Intellig</source>. <volume>45</volume>, <fpage>5497</fpage>&#x02013;<lpage>5512</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3206549</pub-id><pub-id pub-id-type="pmid">37030698</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Brignac</surname> <given-names>D.</given-names></name> <name><surname>Lobo</surname> <given-names>N.</given-names></name> <name><surname>Mahalanobis</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Improving replay sample selection and storage for less forgetting in continual learning,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3540</fpage>&#x02013;<lpage>3549</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCVW60793.2023.00380</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Buzzega</surname> <given-names>P.</given-names></name> <name><surname>Boschini</surname> <given-names>M.</given-names></name> <name><surname>Porrello</surname> <given-names>A.</given-names></name> <name><surname>Abati</surname> <given-names>D.</given-names></name> <name><surname>Calderara</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Dark experience for general continual learning: a strong, simple baseline,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> <volume>33</volume>, <fpage>15920</fpage>&#x02013;<lpage>15930</lpage>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Buzzega</surname> <given-names>P.</given-names></name> <name><surname>Boschini</surname> <given-names>M.</given-names></name> <name><surname>Porrello</surname> <given-names>A.</given-names></name> <name><surname>Calderara</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Rethinking experience replay: a bag of tricks for continual learning,&#x0201D;</article-title> in <source>International Conference on Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>2180</fpage>&#x02013;<lpage>2187</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Caccia</surname> <given-names>L.</given-names></name> <name><surname>Aljundi</surname> <given-names>R.</given-names></name> <name><surname>Asadi</surname> <given-names>N.</given-names></name> <name><surname>Tuytelaars</surname> <given-names>T.</given-names></name> <name><surname>Pineau</surname> <given-names>J.</given-names></name> <name><surname>Belilovsky</surname> <given-names>E.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;New insights on reducing abrupt representation change in online continual learning,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Chrysakis</surname> <given-names>A.</given-names></name> <name><surname>Moens</surname> <given-names>M.-F.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Online continual learning from imbalanced data,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1952</fpage>&#x02013;<lpage>1961</lpage>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cormode</surname> <given-names>G.</given-names></name> <name><surname>Shkapenyuk</surname> <given-names>V.</given-names></name> <name><surname>Srivastava</surname> <given-names>D.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name></person-group> (<year>2009</year>). <article-title>&#x0201C;Forward decay: a practical time decay model for streaming systems,&#x0201D;</article-title> in <source>IEEE International Conference on Data Engineering</source> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>138</fpage>&#x02013;<lpage>149</lpage>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cowan</surname> <given-names>N.</given-names></name></person-group> (<year>2008</year>). <article-title>What are the differences between long-term, short-term, and working memory?</article-title> <source>Prog. Brain Res</source>. <volume>169</volume>, <fpage>323</fpage>&#x02013;<lpage>338</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0079-6123(07)00020-9</pub-id><pub-id pub-id-type="pmid">18394484</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Daxberger</surname> <given-names>E.</given-names></name> <name><surname>Swaroop</surname> <given-names>S.</given-names></name> <name><surname>Osawa</surname> <given-names>K.</given-names></name> <name><surname>Yokota</surname> <given-names>R.</given-names></name> <name><surname>Turner</surname> <given-names>R. E.</given-names></name> <name><surname>Hern&#x000E1;ndez-Lobato</surname> <given-names>J. M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Improving continual learning by accurate gradient reconstructions of the past,&#x0201D;</article-title> in <source>Transactions on Machine Learning Research</source>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dohare</surname> <given-names>S.</given-names></name> <name><surname>Hernandez-Garcia</surname> <given-names>J. F.</given-names></name> <name><surname>Lan</surname> <given-names>Q.</given-names></name> <name><surname>Rahman</surname> <given-names>P.</given-names></name> <name><surname>Mahmood</surname> <given-names>A. R.</given-names></name> <name><surname>Sutton</surname> <given-names>R. S.</given-names></name></person-group> (<year>2024</year>). <article-title>Loss of plasticity in deep continual learning</article-title>. <source>Nature</source> <volume>632</volume>, <fpage>768</fpage>&#x02013;<lpage>774</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-024-07711-7</pub-id><pub-id pub-id-type="pmid">39169245</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Farajtabar</surname> <given-names>M.</given-names></name> <name><surname>Azizan</surname> <given-names>N.</given-names></name> <name><surname>Mott</surname> <given-names>A.</given-names></name> <name><surname>Li</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Orthogonal gradient descent for continual learning,&#x0201D;</article-title> in <source>International Conference on Artificial Intelligence and Statistics</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>3762</fpage>&#x02013;<lpage>3773</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Firoozi</surname> <given-names>R.</given-names></name> <name><surname>Tucker</surname> <given-names>J.</given-names></name> <name><surname>Tian</surname> <given-names>S.</given-names></name> <name><surname>Majumdar</surname> <given-names>A.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Foundation models in robotics: applications, challenges, and the future</article-title>. <source>Int. J. Robot. Res</source>. <volume>44</volume>, <fpage>701</fpage>&#x02013;<lpage>39</lpage>. doi: <pub-id pub-id-type="doi">10.1177/02783649241281508</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Frank</surname> <given-names>M. G.</given-names></name> <name><surname>Benington</surname> <given-names>J. H.</given-names></name></person-group> (<year>2006</year>). <article-title>The role of sleep in memory consolidation and brain plasticity: dream or reality?</article-title> <source>Neuroscientist</source> <volume>12</volume>, <fpage>477</fpage>&#x02013;<lpage>488</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1073858406293552</pub-id><pub-id pub-id-type="pmid">17079514</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gallou&#x000E9;dec</surname> <given-names>Q.</given-names></name> <name><surname>Cazin</surname> <given-names>N.</given-names></name> <name><surname>Dellandr&#x000E9;a</surname> <given-names>E.</given-names></name> <name><surname>Chen</surname> <given-names>L.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;panda-gym: Open-source goal-conditioned environments for robotic learning,&#x0201D;</article-title> in <source>Robot Learning Workshop: Self-Supervised and Lifelong Learning</source> &#x00040; <italic>NeurIPS 2021</italic> (Curran Associates).</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Haarnoja</surname> <given-names>T.</given-names></name> <name><surname>Zhou</surname> <given-names>A.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1861</fpage>&#x02013;<lpage>1870</lpage>.</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Harun</surname> <given-names>M. Y.</given-names></name> <name><surname>Gallardo</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Kanan</surname> <given-names>C.</given-names></name></person-group> (<year>2025</year>). <article-title>&#x0201C;Grasp: a rehearsal policy for efficient online continual learning,&#x0201D;</article-title> in <source>Conference on Lifelong Learning Agents</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1032</fpage>&#x02013;<lpage>1052</lpage>.</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ilboudo</surname> <given-names>W. E. L.</given-names></name> <name><surname>Kobayashi</surname> <given-names>T.</given-names></name> <name><surname>Matsubara</surname> <given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>Adaterm: Adaptive t-distribution estimated robust moments for noise-robust stochastic gradient optimization</article-title>. <source>Neurocomputing</source> <volume>557</volume>:<fpage>126692</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2023.126692</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Isele</surname> <given-names>D.</given-names></name> <name><surname>Cosgun</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Selective experience replay for lifelong learning,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence</source> (<publisher-loc>AAAI</publisher-loc>).</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Johnson</surname> <given-names>H. M.</given-names></name></person-group> (<year>1994</year>). <article-title>Processes of successful intentional forgetting</article-title>. <source>Psychol. Bullet</source>. <volume>116</volume>:<fpage>274</fpage>. doi: <pub-id pub-id-type="doi">10.1037//0033-2909.116.2.274</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Josselyn</surname> <given-names>S. A.</given-names></name> <name><surname>Tonegawa</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>Memory engrams: recalling the past and imagining the future</article-title>. <source>Science</source> <volume>367</volume>:<fpage>eaaw4325</fpage>. doi: <pub-id pub-id-type="doi">10.1126/science.aaw4325</pub-id><pub-id pub-id-type="pmid">31896692</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jung</surname> <given-names>D.</given-names></name> <name><surname>Lee</surname> <given-names>D.</given-names></name> <name><surname>Hong</surname> <given-names>S.</given-names></name> <name><surname>Jang</surname> <given-names>H.</given-names></name> <name><surname>Bae</surname> <given-names>H.</given-names></name> <name><surname>Yoon</surname> <given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;New insights for the stability-plasticity dilemma in online continual learning,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kang</surname> <given-names>H.</given-names></name> <name><surname>Mina</surname> <given-names>R. J. L.</given-names></name> <name><surname>Madjid</surname> <given-names>S. R. H.</given-names></name> <name><surname>Yoon</surname> <given-names>J.</given-names></name> <name><surname>Hasegawa-Johnson</surname> <given-names>M.</given-names></name> <name><surname>Hwang</surname> <given-names>S. J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Forget-free continual learning with winning subnetworks,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>10734</fpage>&#x02013;<lpage>10750</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khan</surname> <given-names>M. E. E.</given-names></name> <name><surname>Swaroop</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Knowledge-adaptation priors,&#x0201D;</article-title> in <source>Advances in neural Information Processing Systems</source> <volume>34</volume>, <fpage>19757</fpage>&#x02013;<lpage>19770</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Noci</surname> <given-names>L.</given-names></name> <name><surname>Orvieto</surname> <given-names>A.</given-names></name> <name><surname>Hofmann</surname> <given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Achieving a better stability-plasticity trade-off via auxiliary networks in continual learning,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>11930</fpage>&#x02013;<lpage>11939</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kirillov</surname> <given-names>A.</given-names></name> <name><surname>Mintun</surname> <given-names>E.</given-names></name> <name><surname>Ravi</surname> <given-names>N.</given-names></name> <name><surname>Mao</surname> <given-names>H.</given-names></name> <name><surname>Rolland</surname> <given-names>C.</given-names></name> <name><surname>Gustafson</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Segment anything,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4015</fpage>&#x02013;<lpage>4026</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kirkpatrick</surname> <given-names>J.</given-names></name> <name><surname>Pascanu</surname> <given-names>R.</given-names></name> <name><surname>Rabinowitz</surname> <given-names>N.</given-names></name> <name><surname>Veness</surname> <given-names>J.</given-names></name> <name><surname>Desjardins</surname> <given-names>G.</given-names></name> <name><surname>Rusu</surname> <given-names>A. A.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Overcoming catastrophic forgetting in neural networks</article-title>. <source>Proc. Nat. Acad. Sci</source>. <volume>114</volume>, <fpage>3521</fpage>&#x02013;<lpage>3526</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1611835114</pub-id><pub-id pub-id-type="pmid">28292907</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kobayashi</surname> <given-names>T.</given-names></name></person-group> (<year>2025</year>). <article-title>Intentionally-underestimated value function at terminal state for temporal-difference learning with mis-designed reward</article-title>. <source>Results Cont. Optimizat</source>. <volume>18</volume>:<fpage>100530</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.rico.2025.100530</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kobayashi</surname> <given-names>T.</given-names></name> <name><surname>Sugino</surname> <given-names>T.</given-names></name></person-group> (<year>2020</year>). <article-title>Reinforcement learning for quadrupedal locomotion with design of continual-hierarchical curriculum</article-title>. <source>Eng. Appl. Artif. Intellig</source>. <volume>95</volume>:<fpage>103869</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.engappai.2020.103869</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Koh</surname> <given-names>P. W.</given-names></name> <name><surname>Sagawa</surname> <given-names>S.</given-names></name> <name><surname>Marklund</surname> <given-names>H.</given-names></name> <name><surname>Xie</surname> <given-names>S. M.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Balsubramani</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Wilds: a benchmark of in-the-wild distribution shifts,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>5637</fpage>&#x02013;<lpage>5664</lpage>.</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kong</surname> <given-names>Y.</given-names></name> <name><surname>Fu</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Human action recognition and prediction: a survey</article-title>. <source>Int. J. Comp. Vision</source> <volume>130</volume>, <fpage>1366</fpage>&#x02013;<lpage>1401</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-022-01594-9</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lai</surname> <given-names>S.</given-names></name> <name><surname>Ma</surname> <given-names>C.</given-names></name> <name><surname>Zhu</surname> <given-names>F.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Lin</surname> <given-names>X.</given-names></name> <name><surname>Meng</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>&#x0201C;Gradient-guided epsilon constraint method for online continual learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>.</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep learning</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>436</fpage>&#x02013;<lpage>444</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature14539</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>T.</given-names></name> <name><surname>Socher</surname> <given-names>R.</given-names></name> <name><surname>Xiong</surname> <given-names>C.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Learn to grow: a continual structure learning framework for overcoming catastrophic forgetting,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>3925</fpage>&#x02013;<lpage>3934</lpage>.</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>L.-J.</given-names></name></person-group> (<year>1992</year>). <article-title>Self-improving reactive agents based on reinforcement learning, planning and teaching</article-title>. <source>Machine Learn</source>. <volume>8</volume>, <fpage>293</fpage>&#x02013;<lpage>321</lpage>. doi: <pub-id pub-id-type="doi">10.1023/A:1022628806385</pub-id></mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Zhu</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name></person-group> (<year>2022</year>). <article-title>Goal-conditioned reinforcement learning: problems and solutions</article-title>. <source>arXiv</source> [preprint] arXiv:2201.08299. doi: <pub-id pub-id-type="doi">10.24963/ijcai.2022/770</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McClelland</surname> <given-names>J. L.</given-names></name> <name><surname>McNaughton</surname> <given-names>B. L.</given-names></name> <name><surname>O&#x00027;Reilly</surname> <given-names>R. C.</given-names></name></person-group> (<year>1995</year>). <article-title>Why there are complementary learning systems in the hippocampus and neocortex: insights from the successes and failures of connectionist models of learning and memory</article-title>. <source>Psychol. Rev</source>. <volume>102</volume>:<fpage>419</fpage>. doi: <pub-id pub-id-type="doi">10.1037/0033-295X.102.3.419</pub-id><pub-id pub-id-type="pmid">7624455</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mermillod</surname> <given-names>M.</given-names></name> <name><surname>Bugaiska</surname> <given-names>A.</given-names></name> <name><surname>Bonin</surname> <given-names>P.</given-names></name></person-group> (<year>2013</year>). <article-title>The stability-plasticity dilemma: investigating the continuum from catastrophic forgetting to age-limited learning effects</article-title>. <source>Front. Psychol</source>. <volume>4</volume>:<fpage>504</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2013.00504</pub-id><pub-id pub-id-type="pmid">23935590</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Osborne</surname> <given-names>M.</given-names></name> <name><surname>Lall</surname> <given-names>A.</given-names></name> <name><surname>Van Durme</surname> <given-names>B.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Exponential reservoir sampling for streaming language models,&#x0201D;</article-title> in <source>Annual Meeting of the Association for Computational Linguistics</source> (<publisher-loc>ACL</publisher-loc>), <fpage>687</fpage>&#x02013;<lpage>692</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ostapenko</surname> <given-names>O.</given-names></name> <name><surname>Rodriguez</surname> <given-names>P.</given-names></name> <name><surname>Caccia</surname> <given-names>M.</given-names></name> <name><surname>Charlin</surname> <given-names>L.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Continual learning via local module composition,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> <volume>34</volume>, <fpage>30298</fpage>&#x02013;<lpage>30312</lpage>.</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Parisi</surname> <given-names>G. I.</given-names></name> <name><surname>Kemker</surname> <given-names>R.</given-names></name> <name><surname>Part</surname> <given-names>J. L.</given-names></name> <name><surname>Kanan</surname> <given-names>C.</given-names></name> <name><surname>Wermter</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>Continual lifelong learning with neural networks: a review</article-title>. <source>Neural Netw</source>. <volume>113</volume>, <fpage>54</fpage>&#x02013;<lpage>71</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2019.01.012</pub-id><pub-id pub-id-type="pmid">30780045</pub-id></mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pomponi</surname> <given-names>J.</given-names></name> <name><surname>Scardapane</surname> <given-names>S.</given-names></name> <name><surname>Uncini</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Continual learning with invertible generative models</article-title>. <source>Neural Netw</source>. <volume>164</volume>, <fpage>606</fpage>&#x02013;<lpage>616</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2023.05.020</pub-id><pub-id pub-id-type="pmid">37244212</pub-id></mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rolnick</surname> <given-names>D.</given-names></name> <name><surname>Ahuja</surname> <given-names>A.</given-names></name> <name><surname>Schwarz</surname> <given-names>J.</given-names></name> <name><surname>Lillicrap</surname> <given-names>T.</given-names></name> <name><surname>Wayne</surname> <given-names>G.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Experience replay for continual learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> 32.</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saglam</surname> <given-names>B.</given-names></name> <name><surname>Mutlu</surname> <given-names>F. B.</given-names></name> <name><surname>Cicek</surname> <given-names>D. C.</given-names></name> <name><surname>Kozat</surname> <given-names>S. S.</given-names></name></person-group> (<year>2023</year>). <article-title>Actor prioritized experience replay</article-title>. <source>J. Artif. Intellig. Res</source>. <volume>78</volume>, <fpage>639</fpage>&#x02013;<lpage>672</lpage>. doi: <pub-id pub-id-type="doi">10.1613/jair.1.14819</pub-id></mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schulman</surname> <given-names>J.</given-names></name> <name><surname>Wolski</surname> <given-names>F.</given-names></name> <name><surname>Dhariwal</surname> <given-names>P.</given-names></name> <name><surname>Radford</surname> <given-names>A.</given-names></name> <name><surname>Klimov</surname> <given-names>O.</given-names></name></person-group> (<year>2017</year>). <article-title>Proximal policy optimization algorithms</article-title>. <source>arXiv</source> [preprint] arXiv:1707.06347. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1707.06347</pub-id></mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shin</surname> <given-names>H.</given-names></name> <name><surname>Lee</surname> <given-names>J. K.</given-names></name> <name><surname>Kim</surname> <given-names>J.</given-names></name> <name><surname>Kim</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Continual learning with deep generative replay,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> 30.</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Stooke</surname> <given-names>A.</given-names></name> <name><surname>Achiam</surname> <given-names>J.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Responsive safety in reinforcement learning by PID Lagrangian methods,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>).</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>S.</given-names></name> <name><surname>Calandriello</surname> <given-names>D.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>A.</given-names></name> <name><surname>Titsias</surname> <given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Information-theoretic online memory selection for continual learning,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source>.</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Titsias</surname> <given-names>M. K.</given-names></name> <name><surname>Schwarz</surname> <given-names>J.</given-names></name> <name><surname>Matthews</surname> <given-names>A. G. d. G.</given-names></name> <name><surname>Pascanu</surname> <given-names>R.</given-names></name> <name><surname>Teh</surname> <given-names>Y. W.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Functional regularisation for continual learning with Gaussian processes,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source>.</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Touvron</surname> <given-names>H.</given-names></name> <name><surname>Lavril</surname> <given-names>T.</given-names></name> <name><surname>Izacard</surname> <given-names>G.</given-names></name> <name><surname>Martinet</surname> <given-names>X.</given-names></name> <name><surname>Lachaux</surname> <given-names>M.-A.</given-names></name> <name><surname>Lacroix</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>LLaMA: open and efficient foundation language models</article-title>. <source>arXiv</source> [preprint] arXiv:2302.13971. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2302.13971</pub-id></mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tsallis</surname> <given-names>C.</given-names></name></person-group> (<year>1988</year>). <article-title>Possible generalization of Boltzmann-Gibbs statistics</article-title>. <source>J. Statist. Phys</source>. <volume>52</volume>, <fpage>479</fpage>&#x02013;<lpage>487</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF01016429</pub-id></mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vitter</surname> <given-names>J. S.</given-names></name></person-group> (<year>1985</year>). <article-title>Random sampling with a reservoir</article-title>. <source>ACM Trans. Mathem. Softw</source>. <volume>11</volume>, <fpage>37</fpage>&#x02013;<lpage>57</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3147.3165</pub-id></mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Ji</surname> <given-names>Z.</given-names></name> <name><surname>Pang</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name></person-group> (<year>2024</year>). <article-title>Uncertainty-aware enhanced dark experience replay for continual learning</article-title>. <source>Appl. Intellig</source>. <volume>54</volume>, <fpage>7135</fpage>&#x02013;<lpage>7150</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10489-024-05488-w</pub-id></mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ye</surname> <given-names>F.</given-names></name> <name><surname>Bors</surname> <given-names>A. G.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Task-free continual learning via online discrepancy distance learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> <volume>35</volume>, <fpage>23675</fpage>&#x02013;<lpage>23688</lpage>.</mixed-citation>
</ref>
<ref id="B59">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Yoo</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Wood</surname> <given-names>F.</given-names></name> <name><surname>Pleiss</surname> <given-names>G.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Layerwise proximal replay: a proximal point method for online continual learning,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>57199</fpage>&#x02013;<lpage>57216</lpage>.</mixed-citation>
</ref>
<ref id="B60">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhong</surname> <given-names>Z.</given-names></name> <name><surname>Zheng</surname> <given-names>L.</given-names></name> <name><surname>Kang</surname> <given-names>G.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Random erasing data augmentation,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence, Vol 34</source> (<publisher-loc>AAAI</publisher-loc>), <fpage>13001</fpage>&#x02013;<lpage>13008</lpage>.</mixed-citation>
</ref>
<ref id="B61">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhuo</surname> <given-names>T.</given-names></name> <name><surname>Cheng</surname> <given-names>Z.</given-names></name> <name><surname>Gao</surname> <given-names>Z.</given-names></name> <name><surname>Fan</surname> <given-names>H.</given-names></name> <name><surname>Kankanhalli</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>Continual learning with strong experience replay</article-title>. <source>arXiv</source> [preprint] arXiv:2305.13622. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2305.13622</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/498/overview">Poramate Manoonpong</ext-link>, University of Southern Denmark, Denmark</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3139054/overview">Ruijia Li</ext-link>, The Engineering Technical College of Chengdu University of Technology, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3263392/overview">Pietro Buzzega</ext-link>, University of Modena and Reggio Emilia, Italy</p>
</fn>
</fn-group>
<fn-group>
<fn id="fn0003"><label>1</label><p>Stability is generally used here rather than consolidation, but in this paper, it is avoided to distinguish from other forms of stability, such as control or learning stability.</p></fn>
</fn-group>
</back>
</article>