<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Computational Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-5188</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fncom.2025.1647462</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A neural network model combining the successor representation and actor-critic methods reveals effective biological use of the representation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Tsurumi</surname> <given-names>Takayuki</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3098465"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Morita</surname> <given-names>Kenji</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/4205"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Physical and Health Education, Graduate School of Education, The University of Tokyo</institution>, <city>Tokyo</city>, <country country="jp">Japan</country></aff>
<aff id="aff2"><label>2</label><institution>International Research Center for Neurointelligence (WPI-IRCN), The University of Tokyo</institution>, <city>Tokyo</city>, <country country="jp">Japan</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Takayuki Tsurumi, <email xlink:href="mailto:ko-takayuki770@g.ecc.u-tokyo.ac.jp">ko-takayuki770@g.ecc.u-tokyo.ac.jp</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-11-26">
<day>26</day>
<month>11</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>19</volume>
<elocation-id>1647462</elocation-id>
<history>
<date date-type="received">
<day>15</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Tsurumi and Morita.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Tsurumi and Morita</copyright-holder>
<license>
<ali:license_ref start_date="2025-11-26">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>In learning goal-directed behavior, state representation is important for adapting to the environment and achieving goals. A predictive state representation called the successor representation (SR) has recently attracted attention as a candidate for state representation in animal brains, especially in the hippocampus. The relationship between the SR and the animal brain has been studied, and several neural network models for computing the SR have been proposed based on the findings. However, studies on implementation of the SR involving action selection have not yet advanced significantly. Therefore, we explore possible mechanisms by which the SR is utilized biologically for action selection and learning optimal action policies. The actor-critic architecture is a promising model of animal behavioral learning in terms of its correspondence to the anatomy and function of the basal ganglia, so it is suitable for our purpose. In this study, we construct neural network models for behavioral learning using the SR. By using them to perform reinforcement learning, we investigate their properties. Specifically, we investigated the effect of using different state representations for the actor and critic in the actor-critic method, and also compared the actor-critic method with Q-learning and SARSA. We found a difference between the effect of using the SR for the actor and the effect of using the SR for the critic in the actor-critic method, and observed that using the SR in conjunction with one-hot encoding makes it possible to learn with the benefits of both representations. These results suggest the possibility that the striatum can learn using multiple state representations complementarily.</p></abstract>
<kwd-group>
<kwd>successor representation</kwd>
<kwd>actor-critic</kwd>
<kwd>neural network</kwd>
<kwd>reinforcement learning</kwd>
<kwd>striatum</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by Grants-in-Aid for Scientific Research 23K27985 and 25H02594 from Japan Society for the Promotion of Science (JSPS).</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="0"/>
<equation-count count="30"/>
<ref-count count="24"/>
<page-count count="12"/>
<word-count count="6792"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>In learning goal-directed behavior, state representation is important for adapting to the environment and achieving goals.</p>
<p>The successor representation (SR) (<xref ref-type="bibr" rid="B6">Dayan, 1993</xref>, <xref ref-type="bibr" rid="B7">2002</xref>) has recently attracted attention as a candidate for state representation in the animal brain, especially the hippocampus. The SR is a state representation based on the prediction of state transitions. Links between the SR and the animal brain have been studied. For example, it is said that assuming that animals use SR-like state representations explains the results of Tolman&#x00027;s experiments on latent learning (<xref ref-type="bibr" rid="B23">Tolman, 1948</xref>; <xref ref-type="bibr" rid="B19">Russek et al., 2017</xref>). In addition, some of the properties of hippocampal place cell activity are common to the SR (<xref ref-type="bibr" rid="B20">Stachenfeld et al., 2017</xref>). Given this background, several neural network models for computing the SR have been proposed (<xref ref-type="bibr" rid="B5">Burton et al., 2023</xref>; <xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>; <xref ref-type="bibr" rid="B10">George et al., 2023</xref>). One of them (<xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>) is a model that uses a recurrent neural network (RNN) in which the SR emerges as a result of the RNN&#x00027;s dynamics and the plasticity of its coupling weights. Another study (<xref ref-type="bibr" rid="B5">Burton et al., 2023</xref>) shows the mathematical equivalence of TD(&#x003BB;) learning with the SR and weight update of a spiking neural network (SNN) derived from inputs assuming hippocampal place cells and spike-timing-dependent plasticity (STDP). Another model (<xref ref-type="bibr" rid="B10">George et al., 2023</xref>) also uses STDP. All those models are considered for implementation in the hippocampus.</p>
<p>The question of how information about the external world is represented in the brain should be considered simultaneously with the question of how the brain uses it to make behavioral choices. This is because all variables related to the brain can be said to represent information from the outside world in the sense that they are influenced by sensory input, and therefore, what matters is how they are reflected in behavioral output. However, the previous studies (<xref ref-type="bibr" rid="B5">Burton et al., 2023</xref>; <xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>; <xref ref-type="bibr" rid="B10">George et al., 2023</xref>) that proposed models for computing the SR did not contain experiments on ways of utilizing the SR in the brain for action selection.</p>
<p>Thus, while several studies on the biological implementation of the SR have emerged, studies on implementation involving action selection have not yet advanced significantly. Therefore, we explore possible mechanisms by which the SR is utilized biologically for action selection and learning optimal action policies.</p>
<p>The actor-critic method (<xref ref-type="bibr" rid="B3">Barto, 1995</xref>; <xref ref-type="bibr" rid="B11">Houk et al., 1995</xref>) is a reinforcement learning method often used as a model of behavioral learning in animals. It was originally devised based on physiological and anatomical findings of the basal ganglia (<xref ref-type="bibr" rid="B11">Houk et al., 1995</xref>). To this day, the actor-critic method is often used as a model for learning by the basal ganglia (<xref ref-type="bibr" rid="B13">Khamassi et al., 2005</xref>; <xref ref-type="bibr" rid="B8">Dunovan and Verstynen, 2016</xref>). It is hypothesized that the dorsolateral striatum corresponds to the actor and the ventral striatum to the critic (<xref ref-type="bibr" rid="B22">Takahashi et al., 2008</xref>). The actor-critic method consists of an &#x0201C;actor&#x0201D; that determines actions and a &#x0201C;critic&#x0201D; that evaluates those actions. The actor learns policies while the critic learns value functions, and state representations such as the SR can be used in the learning. It is also possible to use different state representations for the actor and critic. This corresponds to the use of different state representations in the dorsolateral striatum and the ventral striatum, according to the hypothesis regarding the striatum mentioned above. By conducting simulations where the actor and critic employ different state representations, we expect to gain insights into the utilization of multiple state representations in biological systems.</p>
<p>The actor-critic method is not the only model for behavioral learning in animals. While the actor-critic method learns value and policy separately, there are also models that learn the state-action value function representing the value of actions and determine actions based on it. Q-learning (<xref ref-type="bibr" rid="B24">Watkins and Dayan, 1992</xref>) and SARSA (<xref ref-type="bibr" rid="B18">Rummery and Niranjan, 1994</xref>) are representative methods for learning the state-action value function. It has been suggested that dopamine neurons in the ventral tegmental area (VTA) and the substantia nigra pars compacta (SNc) encode the reward prediction error (RPE) for Q-learning (<xref ref-type="bibr" rid="B17">Roesch et al., 2007</xref>) and SARSA (<xref ref-type="bibr" rid="B16">Morris et al., 2006</xref>), respectively. The SR can also be used to learn state-action value functions (<xref ref-type="bibr" rid="B19">Russek et al., 2017</xref>). In this case, an SR based on transition probabilities between state-action pairs is employed.</p>
<p>In this study, we construct neural network models for action selection using state representations including the SR. By using the models to perform reinforcement learning, we investigate their properties. Specifically, we examine in detail the differences when using the SR for the actor, critic, or both in the actor-critic method. We also examine SARSA and Q-learning using the SR. Through these investigations, we explore possible mechanisms by which the SR is utilized biologically for action selection and learning optimal action policies.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec>
<label>2.1</label>
<title>The successor representation</title>
<p>The goal of reinforcement learning is to maximize the value function:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="double-struck"><mml:mi>E</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>Here, <italic>r</italic><sub><italic>t</italic></sub> is the reward given at time <italic>t</italic>, <italic>s</italic><sub><italic>t</italic></sub> is the state at time <italic>t</italic>, and &#x003B3; is a parameter called the discount factor. &#x1D53C;<sub>&#x003C0;</sub>[&#x000B7;] denotes the expected value when the agent acts according to a policy &#x003C0;. The value function represents the expected cumulative discounted reward.</p>
<p>The value function can be approximated with some basis functions as follows:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>Here, <italic>V</italic> is the estimated value function, <italic><bold>w</bold></italic> is a weight vector, and <italic><bold>x</bold></italic>(<italic>s</italic>) is a feature vector representing state <italic>s</italic>. The weight <italic><bold>w</bold></italic> can be learned by standard TD learning adapted for linear function approximation:</p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x02190;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>where &#x003B1;<sup><italic>w</italic></sup> is a learning rate and &#x003B4; is the TD error. The TD error is defined as</p>
<disp-formula id="EQ4"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003B4;</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>The successor representation (SR) is a state representation based on the prediction of state transitions. The SR matrix <italic>M</italic> is defined as</p>
<disp-formula id="EQ5"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>The rows of the SR matrix can be used as <italic><bold>x</bold></italic>(<italic>s</italic>) in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref> (<xref ref-type="bibr" rid="B19">Russek et al., 2017</xref>):</p>
<disp-formula id="EQ6"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mstyle><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>where <inline-formula><mml:math id="M7"><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:math></inline-formula> is a component of vector <italic><bold>w</bold></italic> corresponding to a state <italic>s</italic>&#x02032;. Then, according to <xref ref-type="disp-formula" rid="EQ3">Equation 3</xref>, <italic><bold>w</bold></italic> can be learned by</p>
<disp-formula id="EQ7"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>&#x02190;</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>for all states <italic>s</italic>&#x02032;.</p>
</sec>
<sec>
<label>2.2</label>
<title>Learning the successor representation</title>
<p>We use one of the methods of learning SR using neural networks proposed in a previous study (<xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>).</p>
<p>A recurrent neural network (RNN) is used to compute the SR. It is assumed that the transition probability matrix <italic>T</italic> is encoded in the synaptic weights of the RNN. Then, the steady-state activity of the network in response to one-hot input <italic><bold>&#x003D5;</bold></italic> retrieves a row of the SR matrix, <italic>M</italic><sup>&#x022A4;</sup><italic><bold>&#x003D5;</bold></italic>.</p>
<p>The dynamics of the RNN is defined with the following equation:</p>
<disp-formula id="EQ8"><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mi>J</mml:mi><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003D5;</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>Here, <italic><bold>x</bold></italic> is the activity of RNN neurons, <italic>J</italic> is the weight matrix of the RNN, <italic>f</italic> is an activation function, <italic><bold>&#x003D5;</bold></italic> is the input, and &#x003B3; is a scaling factor of recurrent activation. This dynamics leads to the steady-state activity</p>
<disp-formula id="EQ9"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mtext>ss</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>I</mml:mi><mml:mo>-</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mi>J</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003D5;</mml:mi></mml:mstyle><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>when <italic>f</italic> is the identity function.</p>
<p>The weight <italic>J</italic> is updated as follows:</p>
<disp-formula id="EQ10"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>J</mml:mi><mml:mo>&#x02190;</mml:mo><mml:mi>J</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:mi>J</mml:mi><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>where &#x003B7; is a learning rate. For each synapse,</p>
<disp-formula id="EQ11"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02190;</mml:mo><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>The first term is a temporally asymmetric potentiation term which is similar to spike-timing-dependent plasticity (STDP). The second term is a form of synaptic depotentiation, and similar inhibitory effects are known to be elements of hippocampal learning (<xref ref-type="bibr" rid="B14">Kullmann and Lamsa, 2007</xref>; <xref ref-type="bibr" rid="B15">Lamsa et al., 2007</xref>).</p>
<p>Although it works with a static learning rate, to accelerate learning, the authors introduced an adaptive learning rate calculated by <inline-formula><mml:math id="M13"><mml:mstyle mathvariant="bold-italic"><mml:mi>n</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x0003C;</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:munder><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:mo>=</mml:mo><mml:mo class="qopname">min</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> for synapses from neuron <italic>j</italic>. Modulating synaptic learning rates as a function of neural activity is consistent with experimental observations of metaplasticity (<xref ref-type="bibr" rid="B2">Abraham and Bear, 1996</xref>; <xref ref-type="bibr" rid="B1">Abraham, 2008</xref>; <xref ref-type="bibr" rid="B12">Hulme et al., 2014</xref>).</p>
<p>The authors assumed that the timescale of the plasticity is longer than the timescale of the RNN dynamics and that <italic><bold>x</bold></italic> can be regarded as converging to the steady state in the update of the weight. Under this assumption, the plasticity (<xref ref-type="disp-formula" rid="EQ10">Equation 10</xref>) leads to</p>
<disp-formula id="EQ12"><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>J</mml:mi><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>where <italic>T</italic> is the transition probability matrix. <italic>T</italic> gives the probability that the agent transitions from a state <italic>s</italic> to a state <italic>s</italic>&#x02032; in one time step: <inline-formula><mml:math id="M15"><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. From <xref ref-type="disp-formula" rid="EQ9">Equations 9</xref>, <xref ref-type="disp-formula" rid="EQ12">12</xref> and</p>
<disp-formula id="EQ13"><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>I</mml:mi><mml:mo>-</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mi>T</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p>which is derived from <xref ref-type="disp-formula" rid="EQ5">Equation 5</xref>, we obtain</p>
<disp-formula id="EQ14"><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mtext>ss</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003D5;</mml:mi></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<p>When <italic>f</italic> is a hyperbolic tangent, the steady state approximates the rows of the SR matrix, and the model becomes stable for larger &#x003B3; values compared to when <italic>f</italic> is the identity function (<xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>). Therefore, we use tanh as <italic>f</italic>. As in the previous study, we use <italic><bold>x</bold></italic> after repeating the update (<xref ref-type="disp-formula" rid="EQ8">Equation 8</xref>) for <italic>t</italic><sub>max</sub> steps such that <inline-formula><mml:math id="M18"><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>max</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mo>&#x0003C;</mml:mo><mml:msup><mml:mrow><mml:mn>10</mml:mn></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> as the steady-state activity <italic><bold>x</bold></italic><sub>ss</sub>.</p>
</sec>
<sec>
<label>2.3</label>
<title>Actor-critic</title>
<p>We adopt an actor-critic method with a policy gradient method (<xref ref-type="bibr" rid="B21">Sutton and Barto, 2018</xref>).</p>
<p>The value function is approximated by</p>
<disp-formula id="EQ15"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<p>and the weight <italic><bold>w</bold></italic> is learned by</p>
<disp-formula id="EQ16"><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x02190;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<p>as described in Section 2.1. The inner product in <xref ref-type="disp-formula" rid="EQ15">Equation 15</xref> is calculated by synapses whose weights represent <italic><bold>w</bold></italic> and whose presynaptic cell activity represents <italic><bold>x</bold></italic>(<italic>s</italic>). The weight update (<xref ref-type="disp-formula" rid="EQ16">Equation 16</xref>) can be interpreted as synaptic plasticity dependent on presynaptic cell activity.</p>
<p>A policy &#x003C0; is defined with an exponential soft-max distribution:</p>
<disp-formula id="EQ17"><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>|</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B2;</mml:mi><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>b</mml:mi></mml:mrow></mml:msub></mml:mstyle><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B2;</mml:mi><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<p>where &#x003C0;(<italic>a</italic>|<italic>s</italic>, <italic><bold>&#x003B8;</bold></italic>) represents the probability of choosing action <italic>a</italic> in state <italic>s</italic> parametrized by <italic><bold>&#x003B8;</bold></italic>, <italic>h</italic>(<italic>s, a</italic>, <italic><bold>&#x003B8;</bold></italic>) represents the preference for action <italic>a</italic> in state <italic>s</italic> parametrized by <italic><bold>&#x003B8;</bold></italic>, and &#x003B2; is a parameter scaling the preference. For the tasks we use, which are maze tasks with 4 actions, the preference <italic>h</italic> is defined as</p>
<disp-formula id="EQ18"><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msubsup><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(18)</label></disp-formula>
<p>where <italic><bold>x</bold></italic>(<italic>s</italic>) is the feature vector representing state <italic>s</italic> and</p>
<disp-formula id="EQ19"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none none none none none none none none none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(19)</label></disp-formula>
<p>The parameter <italic><bold>&#x003B8;</bold></italic> is learned by a learning rule of policy gradient methods (<xref ref-type="bibr" rid="B21">Sutton and Barto, 2018</xref>):</p>
<disp-formula id="EQ20"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mo>&#x02190;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mo class="qopname">ln</mml:mo><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>|</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(20)</label></disp-formula>
<p>when the agent chose action <italic>a</italic> at state <italic>s</italic>, where &#x003B1;<sup>&#x003B8;</sup> is a learning rate and &#x003B3; is a discount factor. For policies expressed by <xref ref-type="disp-formula" rid="EQ17">Equation 17</xref>, this becomes</p>
<disp-formula id="EQ21"><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02190;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B4;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(21)</label></disp-formula>
<p>when the agent chose action <italic>a</italic><sub><italic>i</italic></sub> at state <italic>s</italic>. Here, &#x003B4;<sub><italic>ik</italic></sub> is the Kronecker delta. We omit &#x003B3;<sup><italic>t</italic></sup> for learning efficiency. The inner product in <xref ref-type="disp-formula" rid="EQ18">Equation 18</xref> is calculated by synapses whose weights represent <italic><bold>&#x003B8;</bold></italic> and whose presynaptic cell activity represents <italic><bold>x</bold></italic>(<italic>s</italic>). The learning rule (<xref ref-type="disp-formula" rid="EQ21">Equation 21</xref>) can be interpreted as synaptic plasticity dependent on presynaptic and postsynaptic cell activity.</p>
<p>We use the row corresponding to state <italic>s</italic> of the SR matrix or the one-hot vector corresponding to state <italic>s</italic> as the feature vector <italic><bold>x</bold></italic>(<italic>s</italic>) in <xref ref-type="disp-formula" rid="EQ15">Equations 15</xref>, <xref ref-type="disp-formula" rid="EQ18">18</xref>. The row corresponding to state <italic>s</italic> of the SR matrix is obtained as the steady-state activity <italic><bold>x</bold></italic><sub>ss</sub> of the RNN in <xref ref-type="disp-formula" rid="EQ14">Equation 14</xref> by making <italic><bold>&#x003D5;</bold></italic> the one-hot vector corresponding to state <italic>s</italic>.</p>
<p>We refer to the use of <italic><bold>x</bold></italic> in <xref ref-type="disp-formula" rid="EQ15">Equation 15</xref> as &#x0201C;using <italic><bold>x</bold></italic> for the Critic&#x0201D; and the use of <italic><bold>x</bold></italic> in <xref ref-type="disp-formula" rid="EQ18">Equation 18</xref> as &#x0201C;using <italic><bold>x</bold></italic> for the Actor.&#x0201D; Different feature vectors can be used for the Critic and the Actor. The structures of our model with possible combinations of state representations are shown in <xref ref-type="fig" rid="F1">Figures 1A</xref>&#x02013;<xref ref-type="fig" rid="F1">D</xref>.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>The structure of the proposed model for each combination of state representations. <bold>(A)</bold> Critic: SR, Actor: SR, <bold>(B)</bold> Critic: one-hot, Actor: SR, <bold>(C)</bold> Critic: SR, Actor: one-hot, and <bold>(D)</bold> Critic: one-hot, Actor: one-hot.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-19-1647462-g0001.tif">
<alt-text content-type="machine-generated">Diagram showing four interconnected network models labeled A, B, C, and D. Each model features a central node structure with interrelated variables: phi, theta, x, w, V, and h. Models A, B, and C include a circular network of nodes with lines connecting external variables. Model D lacks the central network, displaying only external variables connected linearly.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>2.4</label>
<title>Q-learning and SARSA</title>
<p>We adopt Q-learning and SARSA as representative methods for learning the state-action value function. The state-action value function is defined by</p>
<disp-formula id="EQ22"><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="double-struck"><mml:mi>E</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(22)</label></disp-formula>
<p>The state-action value function can be approximated with some basis functions as follows:</p>
<disp-formula id="EQ23"><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(23)</label></disp-formula>
<p>Here, <italic>Q</italic> is the estimated state-action value function, <italic><bold>w</bold></italic> is a weight vector, and <italic><bold>x</bold></italic>(<italic>sa</italic>) is a feature vector representing state-action pair <italic>sa</italic>. The update of <italic><bold>w</bold></italic> in Q-learning adapted for linear function approximation is</p>
<disp-formula id="EQ24"><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x02190;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(24)</label></disp-formula>
<p>where</p>
<disp-formula id="EQ25"><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003B4;</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(25)</label></disp-formula>
<p>The update of <italic><bold>w</bold></italic> in SARSA adapted for linear function approximation is</p>
<disp-formula id="EQ26"><mml:math id="M30"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x02190;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>w</mml:mi></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:msup><mml:mi>&#x003B4;</mml:mi><mml:mstyle mathvariant="bold-italic"><mml:mi>x</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(26)</label></disp-formula>
<p>where</p>
<disp-formula id="EQ27"><mml:math id="M31"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003B4;</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(27)</label></disp-formula>
<p>A state-action version of the successor representation matrix is defined as</p>
<disp-formula id="EQ28"><mml:math id="M32"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mi>&#x003B3;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(28)</label></disp-formula>
<p>We use the row corresponding to state-action pair <italic>sa</italic> of the SR matrix or the one-hot vector corresponding to state-action pair <italic>sa</italic> as the feature vector <italic><bold>x</bold></italic>(<italic>sa</italic>) in <xref ref-type="disp-formula" rid="EQ23">Equation 23</xref>. Using a version of the RNN in Section 2.2 where each neuron corresponds to a state-action pair instead of a state, the row corresponding to state-action pair <italic>sa</italic> of the SR matrix is obtained as the steady-state activity <italic><bold>x</bold></italic><sub>ss</sub> of the RNN by making <italic><bold>&#x003D5;</bold></italic> the one-hot vector corresponding to state-action pair <italic>sa</italic>. The inner product in <xref ref-type="disp-formula" rid="EQ23">Equation 23</xref> is calculated by synapses whose weight represents <italic><bold>w</bold></italic> and activity of presynaptic cells represents <italic><bold>x</bold></italic>(<italic>sa</italic>).</p>
<p>We use policies similar to <xref ref-type="disp-formula" rid="EQ17">Equation 17</xref>:</p>
<disp-formula id="EQ29"><mml:math id="M33"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>|</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B2;</mml:mi><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>b</mml:mi></mml:mrow></mml:msub></mml:mstyle><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B2;</mml:mi><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(29)</label></disp-formula>
<p>We calculate the state value function by</p>
<disp-formula id="EQ30"><mml:math id="M34"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>|</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(30)</label></disp-formula>
<p>when we visualize it.</p>
</sec>
<sec>
<label>2.5</label>
<title>Parameters</title>
<p>Unless otherwise noted, the following values were used for each parameter: &#x003B3; &#x0003D; 0.8, &#x003B1;<sup><italic>w</italic></sup> &#x0003D; 0.3, &#x003B1;<sup>&#x003B8;</sup> &#x0003D; 0.3, and &#x003B2; &#x0003D; 1. The initial values of <italic><bold>x</bold></italic>, <italic>J</italic>, <italic><bold>w</bold></italic>, and <italic><bold>&#x003B8;</bold></italic> are a zero vector or a zero matrix.</p>
</sec>
<sec>
<label>2.6</label>
<title>Tasks</title>
<p>The tasks we used are similar to the latent learning task and the policy revaluation task in a previous study (<xref ref-type="bibr" rid="B19">Russek et al., 2017</xref>).</p>
<sec>
<label>2.6.1</label>
<title>Water maze task</title>
<p>This task is intended to examine the basic performance of the model.</p>
<p>We used a grid world without barriers, which is analogous to water mazes. Each position in the grid world is treated as a single state. There are four actions: moving up, down, left, or right. Actions toward the walls are excluded from the choices. The agent starts from the upper left corner and a reward is placed at the lower right corner. When the agent reaches the goal, one trial ends and it starts again from the upper left corner.</p>
<p>When the SR is used, the agent first learns the state representation without reward. During this, actions are randomly selected. When one-hot encoding is used, the state representation is treated as given. Then, a reward is placed and the agent learns the value function and policy. During this, the state representation is fixed.</p>
</sec>
<sec>
<label>2.6.2</label>
<title>Barrier maze task</title>
<p>We use mazes generated by the method described in Section 2.7. Actions toward the barriers are excluded from the choices. Other than that, it is the same as the water maze task.</p>
</sec>
<sec>
<label>2.6.3</label>
<title>Policy revaluation task</title>
<p>The SR is said to enable quick adaptation to changes in the environment (<xref ref-type="bibr" rid="B19">Russek et al., 2017</xref>). Therefore, in order to see the adaptability of this model to changes in the environment, we conducted an experiment in which the arrangement of rewards was changed in the middle. Punishment (negative reward) is also placed in this task.</p>
<p>We use a specific maze generated by the method described in Section 2.7.</p>
<p>When the SR is used, the agent first learns the state representation without reward. During this phase, actions are randomly selected. When one-hot encoding is used, the state representation is treated as given. Then, the agent learns value and policy in each of the placements shown in <xref ref-type="fig" rid="F6">Figure 6A</xref>, in order. The location marked &#x0201C;S&#x0201D; represents the starting point, the red location represents the reward location, and the blue location represents the punishment location. The agent starts from the upper left corner. When the agent reaches the reward or punishment location, it restarts from the starting point. At first, a reward is placed at the lower right corner and a negative reward is placed at the upper right corner. Training is performed for 20 rewarded trials in this environment. Then, the positions of reward and punishment are reversed. Training is performed for 20 rewarded trials in the new environment. During this training, the state representation is fixed.</p>
</sec>
</sec>
<sec>
<label>2.7</label>
<title>Automatic maze generation</title>
<p>We generate barriers on a 7-by-7 grid world. Simply generating barriers randomly can result in the start and the reward location being separated by barriers, making it impossible to reach the reward. Instead of regenerating the maze if the space is divided, we adopt a generation method that avoids division. To clearly demonstrate differences in results of different learning methods and state representations, mazes are constructed by generating narrow, short paths with dead ends. Two paths of each of the lengths 2, 3, and 4 are generated in random order. Path generation is performed by extending a path randomly from a starting point and making an entrance of the path at its endpoint (the starting point becomes a dead end). When generating paths, states adjacent (including diagonally) to existing paths are excluded. First, paths whose starting point of generation is the top-left, top-right, bottom-left, and bottom-right corners are generated in order. After that, starting points of generation are randomly determined. This generation method ensures the space remains undivided.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec>
<label>3.1</label>
<title>Water maze task</title>
<p><xref ref-type="fig" rid="F2">Figures 2A</xref>&#x02013;<xref ref-type="fig" rid="F2">D</xref> show typical examples of the learned value function and optimal actions. The learned value function is depicted in a color map, and the direction of the action with the highest preference in each state is indicated by an arrow. We can see that learning of the value function proceeds faster when the SR is used for both the Critic and the Actor (<xref ref-type="fig" rid="F2">Figure 2A</xref>) than when one-hot encoding is used for both (<xref ref-type="fig" rid="F2">Figure 2D</xref>). Appropriate policies were learned with all four combinations of state representations.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>The results of the water maze task regarding the actor-critic method. <bold>(A&#x02013;D)</bold> Typical examples of the learned value function and optimal actions at the 50th trial. <bold>(A)</bold> Critic: SR, Actor: SR, <bold>(B)</bold> Critic: one-hot, Actor: SR, <bold>(C)</bold> Critic: SR, Actor: one-hot, <bold>(D)</bold> Critic: one-hot, Actor: one-hot, and <bold>(E)</bold> performance comparison between four combinations of state representations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-19-1647462-g0002.tif">
<alt-text content-type="machine-generated">Four heatmaps labeled A to D depict grid-based values with arrows, indicating directional trends. A color bar ranges from zero (light) to one (dark red). Below, line graph E displays steps over trials for different methods, with distinct colored lines.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F2">Figure 2E</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times. The decreasing trends in the number of steps mean successful learning. The plots in <xref ref-type="fig" rid="F2">Figure 2E</xref> indicate that using the SR for the Actor enhances learning efficiency, while the effect of using the SR for the Critic is small in comparison.</p>
<p><xref ref-type="fig" rid="F3">Figures 3A</xref>, <xref ref-type="fig" rid="F3">B</xref> show typical examples of the learned value function and optimal actions for Q-learning and SARSA using the SR, respectively. We can see that learning of the value function proceeds faster for Q-learning than SARSA. This result is natural considering that under the same conditions, an update of <italic>Q</italic>(<italic>s, a</italic>) in Q-learning is larger than or equal to that in SARSA because <inline-formula><mml:math id="M35"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:munder><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02265;</mml:mo><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> in <xref ref-type="disp-formula" rid="EQ25">Equations 25</xref>, <xref ref-type="disp-formula" rid="EQ27">27</xref>.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>The results of the water maze task regarding Q-learning and SARSA. <bold>(A, B)</bold> Typical examples of the learned value function and optimal actions at the 50th trial of Q-learning <bold>(A)</bold>/SARSA <bold>(B)</bold> using the SR. <bold>(C)</bold> Performance comparison between the actor-critic method, Q-learning, and SARSA. <bold>(D, E)</bold> Typical examples of the learned value function and optimal actions at the 50th trial of Q-learning <bold>(D)</bold>/SARSA <bold>(E)</bold> using one-hot encoding. <bold>(F)</bold> Performance comparison between Q-learning using the SR and Q-learning using one-hot encoding. <bold>(G)</bold> Performance comparison between SARSA using the SR and SARSA using one-hot encoding.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-19-1647462-g0003.tif">
<alt-text content-type="machine-generated">Panels A, B, D, and E show grid heatmaps with arrows indicating direction and color gradients representing value, from light to dark red. Panels C, F, and G are line graphs; panel C compares actor-critic, Q-learning, and SARSA methods over fifty trials, showing decreasing steps. Panels F and G compare SR and one-hot methods, indicating step reduction over trials.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F3">Figure 3C</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times, for different learning methods using the SR. As for the actor-critic method (hereafter referred to as AC), the SR is used both for the Critic and the Actor. Note that learning of the SR occasionally failed for Q-learning and SARSA, and such sessions are excluded (also for figures below). This phenomenon has been mentioned in the previous study (<xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>) that proposed the model for learning the SR. This failure occurred in fewer than 10 out of 500 sessions. We can see that learning by AC was significantly faster than Q-learning and SARSA, while Q-learning was slightly faster than SARSA. Q-learning and SARSA showed stagnation in learning. Given that both Q-learning and SARSA correctly learned the optimal action at each state, the large number of steps to reach the reward location can be attributed to the small difference between the probability of selecting the optimal action and the probability of selecting other actions. Indeed, in locations distant from the reward location, Q-values are small, and therefore the difference between the Q-value of the optimal action and that of other actions is also expected to be small. In contrast, with AC, the difference between the preference (denoted by <italic>h</italic>) of the optimal action and that of other actions can become sufficiently large as learning progresses, even in locations distant from the reward location. The slightly faster learning of Q-learning compared to SARSA can be interpreted as a consequence of Q-learning learning the value function faster than SARSA, as mentioned earlier.</p>
<p><xref ref-type="fig" rid="F3">Figures 3D</xref>, <xref ref-type="fig" rid="F3">E</xref> show typical examples of the learned value function and optimal actions for Q-learning and SARSA using one-hot encoding, respectively. In both cases, there is no significant difference compared to when the SR is used.</p>
<p><xref ref-type="fig" rid="F3">Figure 3F</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times, for Q-learning with the SR and Q-learning with one-hot encoding. Learning by Q-learning with the SR was slightly faster than learning by Q-learning with one-hot encoding. <xref ref-type="fig" rid="F3">Figure 3G</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times, for SARSA with the SR and SARSA with one-hot encoding. There is no significant difference in learning speed.</p>
</sec>
<sec>
<label>3.2</label>
<title>Barrier maze task</title>
<p><xref ref-type="fig" rid="F4">Figures 4A</xref>&#x02013;<xref ref-type="fig" rid="F4">D</xref> show typical examples of the learned value function and optimal actions. There is no significant difference in the learned value function across the four combinations of state representations. When the SR is used for the Actor (<xref ref-type="fig" rid="F4">Figures 4A</xref>, <xref ref-type="fig" rid="F4">B</xref>), the learned optimal actions are not appropriate at some locations in short paths with dead ends. The preference for actions in one state is influenced by the learning of actions in other states when the SR is used for the Actor, and this can result in such inappropriate policies.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>The results of the barrier maze task regarding the actor-critic method. <bold>(A&#x02013;D)</bold> Typical examples of the learned value function and optimal actions at the 50th trial. <bold>(A)</bold> Critic: SR, Actor: SR, <bold>(B)</bold> Critic: one-hot, Actor: SR, <bold>(C)</bold> Critic: SR, Actor: one-hot, <bold>(D)</bold> Critic: one-hot, Actor: one-hot, and <bold>(E)</bold> performance comparison between four combinations of state representations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-19-1647462-g0004.tif">
<alt-text content-type="machine-generated">Four heatmaps labeled A, B, C, and D display directional arrows within grids, with colors ranging from light to dark red indicating value intensity. Each is accompanied by a gradient bar indicating values from 0 to 1. Below, line graph E shows performance by trial, with different algorithms represented by blue, orange, green, and red lines, indicating step counts decreasing across 50 trials.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F4">Figure 4E</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times. The decreasing trends in the number of steps mean successful learning. The plots in <xref ref-type="fig" rid="F4">Figure 4E</xref> indicate that both using the SR for the Critic and using the SR for the Actor enhance learning efficiency, with using the SR for the Actor having the greater effect.</p>
<p>As these results show, when the SR is used for the Critic and one-hot encoding is used for the Actor, appropriate policies can be learned and learning is faster than when one-hot encoding is used for both, making it a promising combination of state representations.</p>
<p><xref ref-type="fig" rid="F5">Figures 5A</xref>, <xref ref-type="fig" rid="F5">B</xref> show typical examples of the learned value function and optimal actions for Q-learning and SARSA using the SR, respectively. As in the water maze task, we can see that learning of the value function proceeds faster for Q-learning than SARSA. The learned optimal actions are appropriate even in short paths with dead ends.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>The results of the barrier maze task regarding Q-learning and SARSA. <bold>(A, B)</bold> Typical examples of the learned value function and optimal actions at the 50th trial of Q-learning <bold>(A)</bold>/SARSA <bold>(B)</bold> using the SR. <bold>(C)</bold> Performance comparison between the actor-critic method, Q-learning, and SARSA. <bold>(D, E)</bold> Typical examples of the learned value function and optimal actions at the 50th trial of Q-learning <bold>(D)</bold>/SARSA <bold>(E)</bold> using one-hot encoding. <bold>(F)</bold> Performance comparison between Q-learning using the SR and Q-learning using one-hot encoding. <bold>(G)</bold> Performance comparison between SARSA using the SR and SARSA using one-hot encoding.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-19-1647462-g0005.tif">
<alt-text content-type="machine-generated">Panels A, B, D, and E show heat maps with arrows representing direction and a color gradient indicating value, from light to dark red. Panel C depicts a line graph comparing actor-critic, Q-learning, and SARSA methods over trials. Panels F and G show line graphs comparing SR and one-hot encoding methods over trials, with steps on the y-axis.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F5">Figure 5C</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times, for different learning methods using the SR. As for AC, the SR is used both for the Critic and the Actor. As in the water maze task, we can see that learning by AC was significantly faster than Q-learning and SARSA, while Q-learning was slightly faster than SARSA.</p>
<p><xref ref-type="fig" rid="F5">Figures 5D</xref>, <xref ref-type="fig" rid="F5">E</xref> show typical examples of the learned value function and optimal actions for Q-learning and SARSA using one-hot encoding, respectively. In both cases, there is no significant difference compared to when the SR is used.</p>
<p><xref ref-type="fig" rid="F5">Figure 5F</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times, for Q-learning with the SR and Q-learning with one-hot encoding. Learning by Q-learning with the SR was slightly faster than learning by Q-learning with one-hot encoding. <xref ref-type="fig" rid="F5">Figure 5G</xref> shows the mean and standard error of the number of steps for each trial while training sessions of 50 trials were performed 500 times, for SARSA with the SR and SARSA with one-hot encoding. There is no significant difference in learning speed.</p>
</sec>
<sec>
<label>3.3</label>
<title>Policy revaluation task</title>
<p>As previously described, the agent learns value and policy in each of the placements shown in <xref ref-type="fig" rid="F6">Figure 6A</xref>, in order. <xref ref-type="fig" rid="F6">Figure 6B</xref> shows the mean and standard error of the number of steps for each rewarded trial while training sessions of 40 rewarded trials were performed 500 times. Because we set an upper limit on the total number of steps per session during simulation, some runs end before reaching the reward location 20 times after reversing the reward location and the punishment location. Such runs are excluded from the mean and standard error. The number of such runs was 5 for &#x0201C;Critic: SR, Actor: SR,&#x0201D; 148 for &#x0201C;Critic: one-hot, Actor: SR,&#x0201D; 0 for &#x0201C;Critic: SR, Actor: one-hot,&#x0201D; and 1 for &#x0201C;Critic: one-hot, Actor: one-hot.&#x0201D; Therefore, the actual difference between &#x0201C;Critic: one-hot, Actor: SR&#x0201D; and other combinations is larger than the plot indicates. The number of steps required to reach the reward location increases after the reversal because the new reward location is the original punishment location. As learning progresses, the number of steps required to reach it decreases. When the SR is used for the Critic and one-hot encoding is used for the Actor, the increase in steps after the reversal is smallest. The plots in <xref ref-type="fig" rid="F6">Figure 6B</xref> indicate that using the SR for the Critic suppresses the increase in steps after the reversal, while using the SR for the Actor results in a larger increase in steps after the reversal.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>The results of the policy revaluation task. <bold>(A)</bold> The agent learns value and policy in each placement in order. The location marked &#x0201C;S&#x0201D; represents the starting point, the red location represents the reward location, and the blue location represents the punishment location. <bold>(B)</bold> Performance comparison between four different combinations of state representations. <bold>(C&#x02013;E)</bold> Typical results when the SR is used for the Critic and one-hot encoding is used for the Actor. <bold>(C)</bold> History of rewards earned during the learning process. Negative rewards are punishment. <bold>(D, E)</bold> Learned value function and optimal actions before <bold>(D)</bold>/after <bold>(E)</bold> the reversal.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-19-1647462-g0006.tif">
<alt-text content-type="machine-generated">Diagram with five panels labeled A to E. Panel A shows two grids with different trial configurations. Panel B is a line graph of steps versus trials with multiple lines representing different models. Panel C is a scatter plot of rewards over trials. Panel D shows a grid with arrows indicating values, shaded blue to red. Panel E is similar to D, with a slightly different configuration of values and shading.</alt-text>
</graphic>
</fig>
<p>We analyze the learning process when using the SR for the Critic and one-hot encoding for the Actor, which showed the smallest increase in steps after the reversal, from perspectives other than the number of steps.</p>
<p><xref ref-type="fig" rid="F6">Figure 6C</xref> shows a typical example of the history of rewards earned during learning when the SR is used for the Critic and one-hot encoding for the Actor. In this example, the reward and punishment positions are reversed around the 30th trial. Immediately after the reversal, the probability of reaching the punishment position is higher than the probability of reaching the reward position because the policy at that time is to move toward the punishment position (= the original reward position). We can see that the probability of reaching the punishment position is decreasing again after that.</p>
<p><xref ref-type="fig" rid="F6">Figures 6D</xref>, <xref ref-type="fig" rid="F6">E</xref> show typical examples of the learned value function and optimal actions when the SR is used for the Critic and one-hot encoding for the Actor. It can be seen that after the reversal, the new value function is being learned. Unlike in the barrier maze task, the learned optimal actions lead to dead ends in some short paths. Since this occurs on paths relatively close to the punishment location, it is reasonable to consider that the cause is that the learning effect to move away from the punishment location outweighs the learning effect to move toward the reward location in some short paths.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>We constructed models in which the SR computed in the brain is used by the brain to make action choices, and performed reinforcement learning using these models. Our experiments revealed the difference between the effect of using the SR for the Critic and the effect of using the SR for the Actor in the actor-critic method.</p>
<p>The actor-critic method outperformed Q-learning and SARSA in our experiments. Furthermore, in our model with Q-learning and SARSA, the number of neurons required to represent states increases by a factor of the number of actions, resulting in high computational costs.</p>
<p>In our model with the actor-critic method, the preference for actions in one state is influenced by the learning of actions in other states when the SR is used for the Actor. Our experiments suggest that this inter-state influence can have both positive and negative effects on behavioral learning. Such inter-state influence is not limited to cases in which the SR is used as the state representation or the actor-critic method is used for learning; it may also occur in behavioral learning in animals.</p>
<p>In previous studies on actor-critic methods with neural network models (<xref ref-type="bibr" rid="B4">Barto et al., 1983</xref>; <xref ref-type="bibr" rid="B3">Barto, 1995</xref>), the value function and action preference are calculated by applying weights to the input. In short, the input is used as <italic><bold>x</bold></italic> in Section 2.3. Thus, the same <italic><bold>x</bold></italic> is used for the Critic and the Actor, and no experiments are conducted using different <italic><bold>x</bold></italic>. In our experiment, when using the SR for the Critic and one-hot encoding for the Actor, the SR and one-hot encoding complemented each other. The use of different state representations for the Critic and the Actor corresponds to the use of representations or activities in different regions of the animal brain for the evaluation of value and decision-making. It is possible that such use benefits behavioral learning in animals.</p>
<p>The model to calculate the SR (<xref ref-type="bibr" rid="B9">Fang et al., 2023</xref>) adopted in this study is supposed to be implemented in the hippocampus. Actor-critic methods are closely related to the basal ganglia (<xref ref-type="bibr" rid="B3">Barto, 1995</xref>; <xref ref-type="bibr" rid="B11">Houk et al., 1995</xref>), and it is hypothesized that the dorsolateral striatum corresponds to the Actor and the ventral striatum to the Critic (<xref ref-type="bibr" rid="B22">Takahashi et al., 2008</xref>). From this perspective, our model can be viewed as a model in which the basal ganglia perform value computation and action selection using the SR computed in the hippocampus. More specifically, the RNN part can be interpreted as a model of the hippocampus, and the neuron that represents the value function and the neurons that represent action preference can be interpreted as a model of the striatum.</p>
<p>The striatum is roughly divided into dorsal and ventral parts. The dorsal striatum is further divided into the dorsolateral striatum (or the putamen nucleus in humans) and dorsomedial striatum (or the caudate nucleus in humans), and the ventral striatum contains the nucleus accumbens. Those striatal subdivisions are connected to different cortical and subcortical structures, forming limbic (accumbal), associative (dorsomedial striatal) and sensorimotor (dorsolateral striatal) loops, respectively. In this study, we found that the combination that uses the SR for the Critic and one-hot encoding for the Actor has advantages in terms of learning accurate policies and adaptation to environmental changes. According to the hypothesis that the ventral striatum corresponds to the Critic and the dorsolateral striatum corresponds to the Actor, this combination corresponds to the ventral striatum using the SR and the dorsolateral striatum using one-hot encoding. Combining the hypothesis that the SR is computed in the hippocampus (<xref ref-type="bibr" rid="B20">Stachenfeld et al., 2017</xref>) with the fact that the ventral striatum (particularly the nucleus accumbens) is in the limbic loop as mentioned above and thus receives projections from the limbic system including the hippocampus, it is indeed possible that the ventral striatum uses the SR. On the other hand, one-hot encoding is a state representation that corresponds more directly to the states than the SR. Therefore, it is suggestive that the dorsolateral striatum receives sensorimotor-related information rather than associative information. In addition, the ventral striatum is critical for goal-directed behaviors, while the dorsolateral striatum is critical for habitual behaviors (<xref ref-type="bibr" rid="B5">Burton et al., 2015</xref>). This is also consistent with the use of the SR by the ventral striatum and one-hot encoding by the dorsolateral striatum, because it is reasonable to use structured state representations such as the SR for goal-directed learning and simple state representations such as one-hot encoding for habitual learning.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The code for the simulations is available on GitHub (<ext-link ext-link-type="uri" xlink:href="https://github.com/tkyktrm/frontiers">https://github.com/tkyktrm/frontiers</ext-link>).</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>TT: Conceptualization, Formal analysis, Investigation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. KM: Conceptualization, Funding acquisition, Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s8">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abraham</surname> <given-names>W. C.</given-names></name></person-group> (<year>2008</year>). <article-title>Metaplasticity: tuning synapses and networks for plasticity</article-title>. <source>Nat. Rev. Neurosci</source>. <volume>9</volume>, <fpage>387</fpage>&#x02013;<lpage>387</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrn2356</pub-id><pub-id pub-id-type="pmid">18401345</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abraham</surname> <given-names>W. C.</given-names></name> <name><surname>Bear</surname> <given-names>M. F.</given-names></name></person-group> (<year>1996</year>). <article-title>Metaplasticity: the plasticity of synaptic plasticity</article-title>. <source>Trends Neurosci</source>. <volume>19</volume>, <fpage>126</fpage>&#x02013;<lpage>130</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0166-2236(96)80018-X</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Barto</surname> <given-names>A. G.</given-names></name></person-group> (<year>1995</year>). <article-title>&#x0201C;Adaptive critics and the basal ganglia,&#x0201D;</article-title> in <source>Models of Information Processing in the Basal Ganglia</source>, eds. J. C. Houk, J. L. Davis, and D. G. Beiser (Cambridge, MA: The MIT Press), <fpage>215</fpage>&#x02013;<lpage>232</lpage>. doi: <pub-id pub-id-type="doi">10.7551/mitpress/4708.003.0018</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barto</surname> <given-names>A. G.</given-names></name> <name><surname>Sutton</surname> <given-names>R. S.</given-names></name> <name><surname>Anderson</surname> <given-names>C. W.</given-names></name></person-group> (<year>1983</year>). <article-title>Neuronlike adaptive elements that can solve difficult learning control problems</article-title>. <source>IEEE Trans. Syst. Man Cybern</source>. <volume>13</volume>, <fpage>834</fpage>&#x02013;<lpage>846</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TSMC.1983.6313077</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Burton</surname> <given-names>A. C.</given-names></name> <name><surname>Nakamura</surname> <given-names>K.</given-names></name> <name><surname>Roesch</surname> <given-names>M. R.</given-names></name></person-group> (<year>2015</year>). <article-title>From ventral-medial to dorsal-lateral striatum: neural correlates of reward-guided decision-making</article-title>. <source>Neurobiol. Learn. Mem.</source> <volume>117</volume>, <fpage>51</fpage>&#x02013;<lpage>59</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.nlm.2014.05.003</pub-id><pub-id pub-id-type="pmid">24858182</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dayan</surname> <given-names>P.</given-names></name></person-group> (<year>1993</year>). <article-title>Improving generalization for temporal difference learning: The successor representation</article-title>. <source>Neural Comput</source>. <volume>5</volume>, <fpage>613</fpage>&#x02013;<lpage>624</lpage>. doi: <pub-id pub-id-type="doi">10.1162/neco.1993.5.4.613</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Dayan</surname> <given-names>P.</given-names></name></person-group> (<year>2002</year>). <article-title>&#x0201C;Motivated reinforcement learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, eds. T. Dietterich, S. Becker, and Z. Ghahramani (Cambridge, MA: MIT Press). doi: <pub-id pub-id-type="doi">10.7551/mitpress/1120.003.0006</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dunovan</surname> <given-names>K.</given-names></name> <name><surname>Verstynen</surname> <given-names>T.</given-names></name></person-group> (<year>2016</year>). <article-title>Believer-skeptic meets actor-critic: rethinking the role of basal ganglia pathways during decision-making and reinforcement learning</article-title>. <source>Front. Neurosci</source>. <volume>10</volume>:<fpage>106</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2016.00106</pub-id><pub-id pub-id-type="pmid">27047328</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>C.</given-names></name> <name><surname>Aronov</surname> <given-names>D.</given-names></name> <name><surname>Abbott</surname> <given-names>L.</given-names></name> <name><surname>Mackevicius</surname> <given-names>E. L.</given-names></name></person-group> (<year>2023</year>). <article-title>Neural learning rules for generating flexible predictions and computing the successor representation</article-title>. <source>Elife</source> <volume>12</volume>:<fpage>e80680</fpage>. doi: <pub-id pub-id-type="doi">10.7554/eLife.80680</pub-id><pub-id pub-id-type="pmid">36928104</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>George</surname> <given-names>T. M.</given-names></name> <name><surname>de Cothi</surname> <given-names>W.</given-names></name> <name><surname>Stachenfeld</surname> <given-names>K. L.</given-names></name> <name><surname>Barry</surname> <given-names>C.</given-names></name></person-group> (<year>2023</year>). <article-title>Rapid learning of predictive maps with STDP and theta phase precession</article-title>. <source>Elife</source> <volume>12</volume>:<fpage>e80663</fpage>. doi: <pub-id pub-id-type="doi">10.7554/eLife.80663</pub-id><pub-id pub-id-type="pmid">36927826</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Houk</surname> <given-names>J.</given-names></name> <name><surname>Adams</surname> <given-names>J.</given-names></name> <name><surname>Barto</surname> <given-names>A.</given-names></name></person-group> (<year>1995</year>). <article-title>&#x0201C;A model of how the basal ganglia generate and use neural signals that predict reinforcement,&#x0201D;</article-title> in <source>Models of Information Processing in the Basal Ganglia</source>, eds. J. C. Houk, J. L. Davis, and D. G. Beiser (Cambridge, MA: The MIT Press), <fpage>249</fpage>&#x02013;<lpage>270</lpage>. doi: <pub-id pub-id-type="doi">10.7551/mitpress/4708.003.0020</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hulme</surname> <given-names>S. R.</given-names></name> <name><surname>Jones</surname> <given-names>O. D.</given-names></name> <name><surname>Raymond</surname> <given-names>C. R.</given-names></name> <name><surname>Sah</surname> <given-names>P.</given-names></name> <name><surname>Abraham</surname> <given-names>W. C.</given-names></name></person-group> (<year>2014</year>). <article-title>Mechanisms of heterosynaptic metaplasticity</article-title>. <source>Philos. Trans. R. Soc. B: Biol. Sci</source>. <volume>369</volume>:<fpage>20130148</fpage>. doi: <pub-id pub-id-type="doi">10.1098/rstb.2013.0148</pub-id><pub-id pub-id-type="pmid">24298150</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khamassi</surname> <given-names>M.</given-names></name> <name><surname>Lach&#x000E8;ze</surname> <given-names>L.</given-names></name> <name><surname>Girard</surname> <given-names>B.</given-names></name> <name><surname>Berthoz</surname> <given-names>A.</given-names></name> <name><surname>Guillot</surname> <given-names>A.</given-names></name></person-group> (<year>2005</year>). <article-title>Actor-critic models of reinforcement learning in the basal ganglia: from natural to artificial rats</article-title>. <source>Adapt. Behav</source>. <volume>13</volume>, <fpage>131</fpage>&#x02013;<lpage>148</lpage>. doi: <pub-id pub-id-type="doi">10.1177/105971230501300205</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kullmann</surname> <given-names>D. M.</given-names></name> <name><surname>Lamsa</surname> <given-names>K. P.</given-names></name></person-group> (<year>2007</year>). <article-title>Long-term synaptic plasticity in hippocampal interneurons</article-title>. <source>Nat. Rev. Neurosci</source>. <volume>8</volume>, <fpage>687</fpage>&#x02013;<lpage>699</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrn2207</pub-id><pub-id pub-id-type="pmid">17704811</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lamsa</surname> <given-names>K. P.</given-names></name> <name><surname>Heeroma</surname> <given-names>J. H.</given-names></name> <name><surname>Somogyi</surname> <given-names>P.</given-names></name> <name><surname>Rusakov</surname> <given-names>D. A.</given-names></name> <name><surname>Kullmann</surname> <given-names>D. M.</given-names></name></person-group> (<year>2007</year>). <article-title>Anti-Hebbian long-term potentiation in the hippocampal feedback inhibitory circuit</article-title>. <source>Science</source> <volume>315</volume>, <fpage>1262</fpage>&#x02013;<lpage>1266</lpage>. doi: <pub-id pub-id-type="doi">10.1126/science.1137450</pub-id><pub-id pub-id-type="pmid">17332410</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Morris</surname> <given-names>G.</given-names></name> <name><surname>Nevet</surname> <given-names>A.</given-names></name> <name><surname>Arkadir</surname> <given-names>D.</given-names></name> <name><surname>Vaadia</surname> <given-names>E.</given-names></name> <name><surname>Bergman</surname> <given-names>H.</given-names></name></person-group> (<year>2006</year>). <article-title>Midbrain dopamine neurons encode decisions for future action</article-title>. <source>Nat. Neurosci</source>. <volume>9</volume>, <fpage>1057</fpage>&#x02013;<lpage>1063</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nn1743</pub-id><pub-id pub-id-type="pmid">16862149</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Roesch</surname> <given-names>M. R.</given-names></name> <name><surname>Calu</surname> <given-names>D. J.</given-names></name> <name><surname>Schoenbaum</surname> <given-names>G.</given-names></name></person-group> (<year>2007</year>). <article-title>Dopamine neurons encode the better option in rats deciding between differently delayed or sized rewards</article-title>. <source>Nat. Neurosci</source>. <volume>10</volume>, <fpage>1615</fpage>&#x02013;<lpage>1624</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nn2013</pub-id><pub-id pub-id-type="pmid">18026098</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rummery</surname> <given-names>G. A.</given-names></name> <name><surname>Niranjan</surname> <given-names>M.</given-names></name></person-group> (<year>1994</year>). <source>On-line Q-Learning Using Connectionist Systems, Vol. 37</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>University of Cambridge, Department of Engineering</publisher-name>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Russek</surname> <given-names>E. M.</given-names></name> <name><surname>Momennejad</surname> <given-names>I.</given-names></name> <name><surname>Botvinick</surname> <given-names>M. M.</given-names></name> <name><surname>Gershman</surname> <given-names>S. J.</given-names></name> <name><surname>Daw</surname> <given-names>N. D.</given-names></name></person-group> (<year>2017</year>). <article-title>Predictive representations can link model-based reinforcement learning to model-free mechanisms</article-title>. <source>PLoS Comput. Biol</source>. <volume>13</volume>:<fpage>e1005768</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pcbi.1005768</pub-id><pub-id pub-id-type="pmid">28945743</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stachenfeld</surname> <given-names>K. L.</given-names></name> <name><surname>Botvinick</surname> <given-names>M. M.</given-names></name> <name><surname>Gershman</surname> <given-names>S. J.</given-names></name></person-group> (<year>2017</year>). <article-title>The hippocampus as a predictive map</article-title>. <source>Nat. Neurosci</source>. <volume>20</volume>, <fpage>1643</fpage>&#x02013;<lpage>1653</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nn.4650</pub-id><pub-id pub-id-type="pmid">28967910</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sutton</surname> <given-names>R. S.</given-names></name> <name><surname>Barto</surname> <given-names>A. G.</given-names></name></person-group> (<year>2018</year>). <source>Reinforcement Learning: An Introduction</source>, 2nd Edn. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Takahashi</surname> <given-names>Y.</given-names></name> <name><surname>Schoenbaum</surname> <given-names>G.</given-names></name> <name><surname>Niv</surname> <given-names>Y.</given-names></name></person-group> (<year>2008</year>). <article-title>Silencing the critics: understanding the effects of cocaine sensitization on dorsolateral and ventral striatum in the context of an actor/critic model</article-title>. <source>Front. Neurosci</source>. <volume>2</volume>:<fpage>282</fpage>. doi: <pub-id pub-id-type="doi">10.3389/neuro.01.014.2008</pub-id><pub-id pub-id-type="pmid">18982111</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tolman</surname> <given-names>E. C.</given-names></name></person-group> (<year>1948</year>). <article-title>Cognitive maps in rats and men</article-title>. <source>Psychol. Rev</source>. <volume>55</volume>, <fpage>189</fpage>&#x02013;<lpage>208</lpage>. doi: <pub-id pub-id-type="doi">10.1037/h0061626</pub-id><pub-id pub-id-type="pmid">18870876</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Watkins</surname> <given-names>C. J.</given-names></name> <name><surname>Dayan</surname> <given-names>P.</given-names></name></person-group> (<year>1992</year>). <article-title>Q-learning</article-title>. <source>Mach. Learn</source>. <volume>8</volume>, <fpage>279</fpage>&#x02013;<lpage>292</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF00992698</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/85801/overview">Toshiaki Omori</ext-link>, Kobe University, Japan</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/367271/overview">Keiji Miura</ext-link>, Kwansei Gakuin University, Japan</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/332243/overview">Yutaka Sakai</ext-link>, Tamagawa University, Japan</p>
</fn>
</fn-group>
</back>
</article>