<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2026.1738140</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Spike-based Q-learning in a non-von Neumann architecture</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Shin</surname> <given-names>Donghyuk</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<uri xlink:href="https://loop.frontiersin.org/people/2826392"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jo</surname> <given-names>Hyeongcheol</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Jang</surname> <given-names>Hyeseung</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Jeong</surname> <given-names>Yoo Ho</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Jeong</surname> <given-names>YeonJoo</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<uri xlink:href="https://loop.frontiersin.org/people/1183058"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kwak</surname> <given-names>Joon Young</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<uri xlink:href="https://loop.frontiersin.org/people/530319"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Park</surname> <given-names>Jongkil</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/14425"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lee</surname> <given-names>Suyoun</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<uri xlink:href="https://loop.frontiersin.org/people/1241074"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kim</surname> <given-names>Inho</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/2877377"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Park</surname> <given-names>Jong-Keuk</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<uri xlink:href="https://loop.frontiersin.org/people/1241763"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Park</surname> <given-names>Seongsik</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<uri xlink:href="https://loop.frontiersin.org/people/1272975"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jang</surname> <given-names>Hyun Jae</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<uri xlink:href="https://loop.frontiersin.org/people/2937199"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lee</surname> <given-names>Hyung-Min</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<uri xlink:href="https://loop.frontiersin.org/people/1312510"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Kim</surname> <given-names>Jaewook</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/1243514"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Korea University</institution>, <city>Seoul</city>, <country country="KR">Republic of Korea</country></aff>
<aff id="aff2"><label>2</label><institution>Korea Institute of Science and Technology (KIST)</institution>, <city>Seoul</city>, <country country="KR">Republic of Korea</country></aff>
<aff id="aff3"><label>3</label><institution>LG Electronics Inc</institution>, <city>Seoul</city>, <country country="KR">Republic of Korea</country></aff>
<aff id="aff4"><label>4</label><institution>Ewha Womans University</institution>, <city>Seoul</city>, <country country="KR">Republic of Korea</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Jaewook Kim, <email xlink:href="mailto:jaewookk@kist.re.kr">jaewookk@kist.re.kr</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-03">
<day>03</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>20</volume>
<elocation-id>1738140</elocation-id>
<history>
<date date-type="received">
<day>03</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>23</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Shin, Jo, Jang, Jeong, Jeong, Kwak, Park, Lee, Kim, Park, Park, Jang, Lee and Kim.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Shin, Jo, Jang, Jeong, Jeong, Kwak, Park, Lee, Kim, Park, Park, Jang, Lee and Kim</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-03">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Non-von Neumann architectures overcome the memory-compute separation of von Neumann systems by distributing computation and memory locally, thereby reducing data-transfer bottlenecks and power consumption. These features are particularly advantageous for reinforcement learning (RL) workloads that rely on frequent value-function updates across large state-action spaces. When combined with event-driven spiking neural networks (SNNs), non-von Neumann architectures can further improve overall computational efficiency by leveraging the sparse nature of spike-based processing. In this study, we propose a hardware-feasible SNN-based non-von Neumann architecture that performs Q-learning, one of the most widely known reinforcement learning algorithms. The proposed architecture maps states and actions to individual neurons using one-hot encoding and locally stores each state&#x02013;action pair&#x00027;s <italic>Q</italic>-value in the corresponding synapse. To enable each synapse to update its local <italic>Q</italic>-value based on the next state maximum <italic>Q</italic> stored in other synapses, a neuron group connected through a lateral inhibition structure is employed to produce the maximum <italic>Q</italic>, which is then globally transmitted to all synapses. A delay circuit is also added to align the next-state and current-state values to ensure temporally consistent updates. Each synapse locally generates a learning selection signal and combines it with the globally transmitted signals to update only the target synapse. The proposed architecture was validated through simulations on the Cart-pole benchmark, showing stable learning performance under low-bit precision and achieving comparable accuracy to software-based Q-learning with sufficient bit precision.</p></abstract>
<kwd-group>
<kwd>non-von Neumann architecture</kwd>
<kwd>neuromorphic architecture</kwd>
<kwd>SNN</kwd>
<kwd>reinforcement learning</kwd>
<kwd>Q-learning</kwd>
<kwd>cart-pole</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
 <funding-source id="sp1">
 <institution-wrap>
 <institution>Korea Institute of Science and Technology</institution>
 <institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100003693</institution-id>
 </institution-wrap>
 </funding-source>
 <award-id rid="sp1">2E33560</award-id>
</award-group>
<award-group id="gs2">
 <funding-source id="sp2">
 <institution-wrap>
 <institution>National Research Foundation of Korea</institution>
 <institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100003725</institution-id>
 </institution-wrap>
 </funding-source>
 <award-id rid="sp2">2021M3F3A2A01037808</award-id>
</award-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported in part by the Korea Institute of Science and Technology (KIST) under Grants 2E33560 and 2E33721, in part by the Institute of Information &#x00026; Communications Technology Planning &#x00026; Evaluation (IITP) funded by the Korea government (Ministry of Science and ICT, MSIT) (RS-2025-02217259), and in part by the National R&#x00026;D Program through the National Research Foundation of Korea (NRF) funded by MSIT (2021M3F3A2A01037808).</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="4"/>
<equation-count count="2"/>
<ref-count count="28"/>
<page-count count="11"/>
<word-count count="7910"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Neuromorphic Engineering</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Reinforcement learning (RL) provides a computational framework in which an agent learns optimal policies by interacting with the environment and receiving feedback in the form of rewards (<xref ref-type="bibr" rid="B21">Sutton and Barto, 2015</xref>). RL has been widely adopted in domains such as robotics, Internet of Things (IoT) systems, smart grid energy management, and communication systems, which are characterized by stringent power and latency constraints as well as the need to process large-scale streaming data efficiently (<xref ref-type="bibr" rid="B20">Span&#x000F2; et al., 2019</xref>). To meet these requirements, researchers have focused on enhancing the computational efficiency of RL algorithms. Parallel hardware acceleration platforms, including general-purpose GPUs (<xref ref-type="bibr" rid="B24">Tiwari et al., 2025</xref>), field-programmable gate arrays (FPGAs; <xref ref-type="bibr" rid="B25">Tran et al., 2022</xref>; <xref ref-type="bibr" rid="B16">Salomo et al., 2025</xref>), and custom accelerators (<xref ref-type="bibr" rid="B20">Span&#x000F2; et al., 2019</xref>), have shown substantial improvements in processing speed. Nevertheless, such approaches still exhibit much lower energy efficiency than biological neural systems, highlighting a substantial gap between artificial and biological computation (<xref ref-type="bibr" rid="B27">Yamazaki et al., 2022</xref>).</p>
<p>As an alternative to close this gap, spiking neural networks (SNNs)&#x02014;a bio-plausible third-generation neural model&#x02014;have attracted considerable attention (<xref ref-type="bibr" rid="B22">Taherkhani et al., 2020</xref>; <xref ref-type="bibr" rid="B14">Mehonic and Kenyon, 2022</xref>; <xref ref-type="bibr" rid="B10">Kiselev et al., 2025</xref>). Due to their event-driven nature, SNNs remain largely inactive in the absence of spikes, thereby enabling highly energy-efficient computation. However, executing SNN-based algorithms on conventional von Neumann architectures still suffers from computational delays and energy overhead caused by sequential memory access and control logic bottlenecks (<xref ref-type="bibr" rid="B8">Ha&#x0015F;egan et al., 2022</xref>; <xref ref-type="bibr" rid="B12">Liu and Pan, 2023</xref>; <xref ref-type="bibr" rid="B11">Liu et al., 2023</xref>; <xref ref-type="bibr" rid="B19">Siddique et al., 2023</xref>).</p>
<p>Neuromorphic processors such as Intel&#x00027;s Loihi (<xref ref-type="bibr" rid="B6">Davies et al., 2018</xref>), Stanford&#x00027;s Neurogrid (<xref ref-type="bibr" rid="B4">Benjamin et al., 2014</xref>), and IBM&#x00027;s TrueNorth (<xref ref-type="bibr" rid="B3">Akopyan et al., 2015</xref>) were developed to support spike-based computation. These architectures mitigate the structural bottlenecks of von Neumann systems and demonstrate the feasibility of large-scale spike-based processing with improved energy efficiency. Recent studies have successfully implemented RL algorithms, including Deep Q-Networks (DQN) and Deep Deterministic Policy Gradient (DDPG), on the Loihi platform, thus demonstrating their potential for real-time, low-power learning (<xref ref-type="bibr" rid="B23">Tang et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Akl et al., 2021</xref>; <xref ref-type="bibr" rid="B28">Zanatta et al., 2023</xref>).</p>
<p>Despite remarkable progress in neuromorphic hardware, SNN processors are not yet fully non-von Neumann architectures due to programming requirements for general-purpose functionality. For example, Loihi employs programmable virtual synaptic connections to configure neural networks with reconfigurable connectivity. Once spikes are transmitted into a core, a sequence of operations within the core&#x02014;including the identification of target neurons, retrieval and update of the associated neuronal and synaptic data from memory, and storage of results&#x02014;causes computational latency. Parallelization across multiple cores can alleviate memory-access delays compared with conventional von Neumann architectures; however, eliminating memory-search operations altogether would enable even greater energy efficiency.</p>
<p>In this work, we propose a non-von Neumann architecture that performs Q-learning&#x02014;a well-established reinforcement learning algorithm&#x02014;based on SNNs. States and actions are one-hot encoded into input and output neurons, respectively, and the synapses between them are hardwired with a fixed topology such that each synapse locally stores and updates the <italic>Q</italic>(<italic>S, A</italic>) value through an up/down counter. This enables Q-table updates to be executed directly through spike events without requiring complex memory search or control logic, thereby reducing bottlenecks and improving energy efficiency.</p>
<p>A key challenge in this architecture is the distributed storage of <italic>Q</italic>-values across synapses, which complicates simultaneous access to both the <italic>Q</italic>(<italic>S, A</italic>) of the current state and the maximum <italic>Q</italic>(<italic>S</italic>&#x02032;, <italic>a</italic>) of the next state. Spatially, these values are stored in different local synapses and therefore are not directly accessible, while temporally, they do not coexist immediately after a state transition. Furthermore, because the target synapse for update is not predetermined, globally transmitting the maximum <italic>Q</italic>(<italic>S</italic>&#x02032;, <italic>a</italic>) risks unintended simultaneous updates across multiple synapses.</p>
<p>This challenge is addressed by proposing three architectural mechanisms. First, a population of neurons is designed to compute the maximum <italic>Q</italic>(<italic>S</italic>&#x02032;, <italic>a</italic>) in the next state through a lateral inhibition structure, and the resulting spikes are subsequently distributed globally. Second, spikes encoding the <italic>Q</italic>(<italic>S, A</italic>) of the selected action in the current state are temporally delayed via a delay circuit to ensure their co-occurrence with the maximum <italic>Q</italic>(<italic>S</italic>&#x02032;, <italic>a</italic>) spikes at the same time instance. Third, because spikes representing the current state and the selected action&#x00027;s <italic>Q</italic>-value are delivered simultaneously only to their corresponding synapses, their coincidence generates a selection signal that enables synapse-specific updates even in the presence of globally broadcast signals. These mechanisms enable each synapse to independently perform Q-learning updates without additional memory or address lookups.</p>
<p>The hardware feasibility of the proposed architecture is demonstrated through simulations in the cart-pole environment, a widely used reinforcement learning benchmark. The learning performance is further evaluated by varying the synaptic memory precision from 2 to 5 bits, allowing identification of the minimum precision required to sustain learning and the bit-width necessary to achieve performance comparable to conventional Q-learning. Such analysis provides practical insights into the trade-off between resource efficiency and learning performance in neuromorphic hardware implementations.</p></sec>
<sec id="s2">
<label>2</label>
<title>Background</title>
<p>Q-learning is a type of off-policy Temporal Difference (TD) learning, in which the value of the current state is updated using the estimated value of the next state. In off-policy learning, the behavior policy, which determines the agent&#x00027;s actions, is separated from the target policy that the agent aims to optimize. In Q-learning, the behavior policy is typically implemented using an epsilon-greedy policy, where an action is selected at random with probability &#x003B5;, and the action that maximizes the reward is selected with probability 1&#x02212;&#x003B5;. The target policy, in contrast, follows a greedy policy that consistently selects the action associated with the highest <italic>Q</italic>-value.</p>
<p>The goal of Q-learning is to enable an agent to interact with its environment and learn an optimal policy that determines the best action <italic>A</italic> to take in each state <italic>S</italic>. The agent iteratively estimates the state&#x02013;action value function <italic>Q</italic>(<italic>S, A</italic>), which facilitates the selection of optimal actions with respect to the current state. The Q-learning update rule is given by</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>A</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02190;</mml:mo><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>A</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>R</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:msub><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>A</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where &#x003B1; &#x02208; (0, 1] is the learning rate, which determines the extent to which newly obtained information overrides previously acquired estimates. <italic>R</italic> is the immediate reward received after taking action <italic>A</italic> in state <italic>S</italic>. The discount factor &#x003B3; &#x02208; [0, 1] determines the relative importance of future rewards. The term <inline-formula><mml:math id="M2"><mml:msub><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the maximum estimated value of the next state <italic>S</italic>&#x02032;. The <italic>Q</italic>-value is updated after the agent performs an action <italic>A</italic> in the current state <italic>S</italic>, interacts with the environment, and subsequently observes the next state <italic>S</italic>&#x02032; together with the reward <italic>R</italic>.</p></sec>
<sec id="s3">
<label>3</label>
<title>Method</title>
<sec>
<label>3.1</label>
<title>SNN architecture for Q-learning</title>
<sec>
<label>3.1.1</label>
<title>State-action mapping and policy implementation</title>
<p><xref ref-type="fig" rid="F1">Figure 1A</xref> shows the proposed non-von Neumann architecture implementing SNN-based Q-learning, and <xref ref-type="fig" rid="F1">Figure 1B</xref> illustrates the waveforms that represent the operation of the architecture. States and actions are mapped to individual leaky integrate-and-fire (LIF) neurons (<xref ref-type="bibr" rid="B1">Abbott, 1999</xref>), enabling a direct mapping between state-action space and neural representation. The neurons representing states and those representing actions are fully connected, and the synapses between them correspond to the Q-table, with each synaptic weight encoding <italic>Q</italic>(<italic>S, A</italic>). Each state neuron <italic>S</italic><sub><italic>n</italic></sub> (<italic>n</italic> &#x0003D; 1, 2, &#x02026;, <italic>p</italic>) represents one element of the state set <italic>S</italic> &#x0003D; {<italic>s</italic><sub>1</sub>, <italic>s</italic><sub>2</sub>, &#x022EF;&#x02009;, <italic>s</italic><sub><italic>p</italic></sub>}, and each action neuron <italic>A</italic><sub><italic>m</italic></sub> (<italic>m</italic> &#x0003D; 1, 2, &#x02026;, <italic>q</italic>) represents one element of the action set <italic>A</italic> &#x0003D; {<italic>a</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, &#x022EF;&#x02009;, <italic>a</italic><sub><italic>q</italic></sub>}.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p><bold>(A)</bold> Block diagram of the proposed non-von Neumann SNN architecture for Q-learning. <bold>(B)</bold> Operation waveforms of the proposed architecture for three states (<italic>p</italic> = 3) and two actions (<italic>q</italic> = 2). As the state transitions through <italic>s</italic><sub>1</sub>, <italic>s</italic><sub>2</sub> and <italic>s</italic><sub>3</sub>, the state spikes <italic>S</italic><sub><italic>n</italic></sub>(<italic>t</italic>) are generated. Depending on <italic>S</italic><sub><italic>n</italic></sub>(<italic>t</italic>) and the exploration signal <italic>E</italic><sub><italic>m</italic></sub>(<italic>t</italic>), which is randomly activated according to &#x003B5;<sub>&#x003C0;</sub>, <italic>A</italic><sub><italic>m</italic></sub>(<italic>t</italic>) exhibits a firing frequency representing <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>). The delayed signal <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>) reflects <italic>A</italic><sub><italic>m</italic></sub>(<italic>t</italic>) shifted by &#x003C4;<sub><italic>d</italic></sub>. Independent of <italic>E</italic><sub><italic>m</italic></sub>(<italic>t</italic>), &#x003B3;<italic>a</italic><sub><italic>m</italic></sub>(<italic>t</italic>) generates spikes corresponding to the maximum <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>). Based on environmental feedback, either <italic>R</italic>(<italic>t</italic>) or <italic>P</italic>(<italic>t</italic>) fires, and upon each state transition, &#x003B1;(<italic>t</italic>) produces a pulse of duration &#x003C4;<sub>&#x003B1;</sub>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1738140-g0001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a proposed non-von Neumann architecture implementing SNN-based Q-learning. In figure (A), states and actions are mapped to leaky integrate-and-fire neurons, with synaptic weights encoding Q(S, A). A one-hot encoder activates the corresponding state neuron, whose spikes are transmitted to action neurons. Action selection follows an epsilon-greedy policy, combining lateral inhibition for exploitation and an exploration circuit controlled by epsilon through a MUX. Additional neurons and synaptic delays enable access to both current and next Q-values. Figure (B) shows example spike waveforms, illustrating state transitions, action neuron firing under exploitation and exploration, delayed spikes, and the temporal overlap used for Q-learning updates.</alt-text>
</graphic>
</fig>
<p>The observed state <italic>S</italic> from the environment is one-hot encoded (<xref ref-type="bibr" rid="B18">Seger, 2018</xref>), producing a binary one-hot signal in which only the element corresponding to <italic>S</italic> is set to &#x0201C;1,&#x0201D; whereas all others are set to &#x0201C;0.&#x0201D; The resulting vector activates the corresponding state neuron <italic>S</italic><sub><italic>n</italic></sub>, which in turn generates spikes transmitted to the entire population of action neurons. Under the epsilon-greedy policy, action neuron <italic>A</italic><sub><italic>m</italic></sub> emits spikes with a firing rate proportional to <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>), determined by either exploitation or exploration.</p>
<p>In the proposed architecture, exploitation is implemented through a lateral inhibition structure, in which the outputs of the action neurons mutually suppress one another, allowing only the action neuron associated with the highest <italic>Q</italic> to become active. Exploration is implemented through the circuit shown in <xref ref-type="fig" rid="F1">Figure 1A</xref>, where a discrete random variable <italic>X</italic> selects one element from the action set <italic>A</italic> &#x0003D; {<italic>a</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, &#x022EF;&#x02009;, <italic>a</italic><sub><italic>q</italic></sub>} with uniform probability whenever the state changes. The selected value is provided as input to a one-hot encoder, which converts it into a digital parallel signal. The encoder output is then processed through an inverter, and the inverted signal is combined with the spikes generated by the MUX via an AND gate, resulting in either spikes or 0. These combined spikes suppress the action neurons before lateral inhibition takes effect, thereby allowing only the neuron corresponding to <italic>X</italic> to remain active and emit spikes proportional to <italic>Q</italic>.</p>
<p>The balance between exploitation and exploration is determined by the discrete random variable &#x003B5;<sub>&#x003C0;</sub>, which takes the value 0 with probability 1&#x02212;&#x003B5; and 1 with probability &#x003B5;. When &#x003B5;<sub>&#x003C0;</sub> &#x0003D; 0, the MUX output is 0, and the architecture operates in exploitation mode without suppression of the action neurons by the AND gates. Conversely, when &#x003B5;<sub>&#x003C0;</sub> &#x0003D; 1, the MUX output generates spikes that pass through the AND gates and suppress all but one action neuron, thereby enabling exploration. The outputs of the action neurons are subsequently transmitted to a selection module that identifies the action neuron with the highest firing frequency and delivers the corresponding action <italic>A</italic> to the environment.</p>
<p><xref ref-type="fig" rid="F1">Figure 1B</xref> illustrates the spiking activity of the state neurons in response to state transitions and the spiking of the action neurons as determined by the value of &#x003B5;<sub>&#x003C0;</sub> for <italic>p</italic> = 3 and <italic>q</italic> = 2. The state changes asynchronously in the order of <italic>s</italic><sub>1</sub>, <italic>s</italic><sub>2</sub>, <italic>s</italic><sub>3</sub>, causing the corresponding <italic>S</italic><sub>1</sub>, <italic>S</italic><sub>2</sub>, and <italic>S</italic><sub>3</sub> neurons to fire sequentially. After each state transition, <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) is immediately updated, and details on this <italic>Q</italic> update process are provided in Sections 3.1.2 and 3.1.3. For <italic>s</italic><sub>1</sub> and <italic>s</italic><sub>3</sub>, where exploitation is applied, the <italic>A</italic><sub>2</sub> and <italic>A</italic><sub>1</sub> neurons fire according to the highest <italic>Q</italic>, <italic>Q</italic>(<italic>s</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>) and <italic>Q</italic>(<italic>s</italic><sub>3</sub>, <italic>a</italic><sub>1</sub>), respectively. In contrast, for <italic>s</italic><sub>2</sub>, where exploration is applied, the <italic>A</italic><sub>1</sub> neuron fires despite <italic>Q</italic>(<italic>s</italic><sub>2</sub>, <italic>a</italic><sub>2</sub>) being higher than <italic>Q</italic>(<italic>s</italic><sub>2</sub>, <italic>a</italic><sub>1</sub>), due to the suppression of the <italic>A</italic><sub>2</sub> neuron by the <italic>E</italic><sub>2</sub> spikes.</p></sec>
<sec>
<label>3.1.2</label>
<title>Spike encoding for Q-learning updates</title>
<p>To adapt Q-learning updates for SNN operation, the elements of <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>, which include the reward <italic>R</italic>, <inline-formula><mml:math id="M3"><mml:mi>&#x003B3;</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, and <italic>Q</italic>(<italic>S, A</italic>), are encoded as spike signals whose firing frequencies are proportional to their respective values and delivered to the target synapses. The learning rate &#x003B1; is represented as a pulse whose width is proportional to the corresponding value and is transmitted to all synapses. The encoding and delivery of these transformed signals are illustrated in the red dashed box in <xref ref-type="fig" rid="F1">Figure 1B</xref>.</p>
<p>Spike signals with firing frequencies proportional to <italic>Q</italic>(<italic>S, A</italic>) and <inline-formula><mml:math id="M4"><mml:msub><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> can be generated by introducing LIF neurons driven by these terms. In the proposed architecture, however, the firing frequency of each action neuron is inherently proportional to <italic>Q</italic>(<italic>S, A</italic>). Therefore, spike signals representing <italic>Q</italic>(<italic>S, A</italic>) can be obtained directly from the existing action neurons without the need for additional circuitry (<xref ref-type="fig" rid="F1">Figure 1A</xref>).</p>
<p>When exploration is applied under the epsilon-greedy policy, spike signals with firing frequencies proportional to <inline-formula><mml:math id="M5"><mml:msub><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is not obtainable from the action neurons. To address this limitation, the proposed architecture incorporates additional &#x003B3;<italic>a</italic><sub><italic>m</italic></sub> neurons (<xref ref-type="fig" rid="F1">Figure 1A</xref>). These neurons share the same synaptic connections as the action neurons but implement only the lateral inhibition structure associated with exploitation. The scaling factor &#x003B3; is determined by adjusting the thresholds of the &#x003B3;<italic>a</italic><sub><italic>m</italic></sub> neurons, and their firing frequencies vary in proportion to &#x003B3;.</p>
<p>For the Q-learning update, both the <italic>Q</italic>-value of the current state and that of the next state are required to be simultaneously available. However, because only the current <italic>Q</italic>-value is stored in synapses, the <italic>Q</italic>-value of the next state becomes available only after the state transition. To resolve this issue, the proposed architecture incorporates a delay mechanism that enables the coexistence of the current and next <italic>Q</italic>-values within the next state by delaying the <italic>A</italic><sub><italic>m</italic></sub> spikes corresponding to <italic>Q</italic>(<italic>S, A</italic>). Specifically, the spike signals <italic>A</italic><sub><italic>m</italic></sub>(<italic>t</italic>), which represent <italic>Q</italic>(<italic>S, A</italic>), are delayed by a fixed time interval &#x003C4;<sub><italic>d</italic></sub> to generate <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>). This delayed signal ensures that, during the next state, spikes representing <italic>Q</italic>(<italic>S, A</italic>) remain available for a duration of &#x003C4;<sub><italic>d</italic></sub>.</p>
<p>The outputs of each <italic>A</italic><sub><italic>m</italic></sub> neuron are delayed individually, such that spikes corresponding to <italic>Q</italic>(<italic>S, A</italic>) can be delivered to the synapses of the same <italic>A</italic><sub><italic>m</italic></sub> neuron during the next state, thereby enabling the Q-learning update. The outputs of the &#x003B3;<italic>a</italic> neurons are combined using an OR gate to form a single &#x003B3;(<italic>t</italic>) signal, which is then delivered to all synapses.</p>
<p>In <xref ref-type="fig" rid="F1">Figure 1B</xref>, spikes corresponding to <italic>Q</italic>(<italic>S, A</italic>) and spikes corresponding to <inline-formula><mml:math id="M6"><mml:mi>&#x003B3;</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> coexist for a duration of &#x003C4;<sub><italic>d</italic></sub> immediately following a state transition. For example, after the transition from <italic>s</italic><sub>1</sub> to <italic>s</italic><sub>2</sub>, <italic>A</italic><sub><italic>d</italic>2</sub> spikes appear with a firing frequency proportional to <italic>Q</italic>(<italic>s</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>), while, within the same interval, &#x003B3;<italic>a</italic><sub>2</sub> spikes emerge with a frequency proportional to <italic>Q</italic>(<italic>s</italic><sub>2</sub>, <italic>a</italic><sub>2</sub> ).</p>
<p>The reward signal <italic>R</italic>, positive for rewards and negative for penalties, is converted into spikes without sign information by introducing two additional neurons (<xref ref-type="fig" rid="F1">Figure 1A</xref>): an <italic>R</italic> neuron for rewards and a <italic>P</italic> neuron for penalties. For each state, when a reward occurs, only the <italic>R</italic> neuron emits spikes with a frequency proportional to the reward magnitude, whereas when a penalty occurs, only the <italic>P</italic> neuron emits spikes with a frequency proportional to the penalty magnitude. The spikes from these neurons are delivered to all synapses to drive the Q-learning update.</p>
<p>The spike signals corresponding to the terms in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref> coexist only during a limited interval &#x003C4;<sub><italic>d</italic></sub> after a state transition, which defines the effective update window in the next state. In the proposed architecture, the learning rate &#x003B1; is implemented by an &#x003B1; generator that produces a pulse of width &#x003C4;<sub>&#x003B1;</sub>, proportional to &#x003B1; and bounded by &#x003C4;<sub><italic>d</italic></sub>. This pulse is triggered at each state transition and only spikes occurring within the &#x003C4;<sub>&#x003B1;</sub> window contribute to the Q-learning updates. A smaller &#x003C4;<sub>&#x003B1;</sub> results in fewer spikes being involved in the computation. As illustrated in <xref ref-type="fig" rid="F1">Figure 1B</xref>, when &#x003C4;<sub>&#x003B1;</sub> &#x0003C; &#x003C4;<sub><italic>d</italic></sub>, only <italic>R, P</italic>, <italic>A</italic><sub><italic>dm</italic></sub> and &#x003B3;<italic>a</italic><sub><italic>m</italic></sub> spikes within the &#x003C4;<sub>&#x003B1;</sub> window are utilized for learning.</p></sec>
<sec>
<label>3.1.3</label>
<title>Spike-based synaptic update circuit for Q-learning</title>
<p>As shown in <xref ref-type="fig" rid="F1">Figure 1A</xref>, <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>), <italic>R</italic>(<italic>t</italic>), <italic>P</italic>(<italic>t</italic>), and &#x003B3;<italic>a</italic>(<italic>t</italic>) are delivered globally to all synapses. <italic>S</italic><sub><italic>n</italic></sub>(<italic>t</italic>) is transmitted only to the synapses connected to the specific state neuron <italic>S</italic><sub><italic>n</italic></sub>, while <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>) is transmitted exclusively to the synapses connected to the selected action neuron <italic>A</italic><sub><italic>m</italic></sub>. Consequently, both signals are simultaneously present only at synapses where the current state and the currently selected action are jointly represented. The proposed architecture exploits this structural feature to generate a selection signal based on these two inputs, which in turn determines the Q-learning update.</p>
<p><xref ref-type="fig" rid="F2">Figure 2A</xref> shows the block diagram of an individual synapse in the proposed architecture, which performs the computations required for the Q-learning update and stores the <italic>Q</italic>-values. As Q-learning updates occur during the &#x003C4;<sub><italic>d</italic></sub> period of the next state, the <italic>S</italic><sub><italic>n</italic></sub> spikes are delayed by &#x003C4;<sub><italic>d</italic></sub> to remain valid within this interval. After the occurrence of <italic>S</italic><sub><italic>dn</italic></sub> spikes, the subsequent arrival of <italic>A</italic><sub><italic>dm</italic></sub> spikes generates the eligibility trace.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p><bold>(A)</bold> Block diagram of a synaptic circuit performing local updates of the weight corresponding to <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>), where the delayed state and delayed action spikes generate an eligibility trace that is combined with update-related inputs to produce <italic>LTP</italic><sub><italic>nm</italic></sub>/<italic>LTD</italic><sub><italic>nm</italic></sub> spikes driving the counter-based <italic>Q</italic>-value update. <bold>(B)</bold> Operation waveforms of the eligibility trace generator and the waveform conversion of the trace using a buffer. The eligibility trace is generated when an <italic>A</italic><sub><italic>dm</italic></sub> spike occurs within &#x003C4;<sub><italic>s</italic></sub> after an <italic>S</italic><sub><italic>dn</italic></sub> spike, and this trace is converted into the <italic>ET</italic><sub><italic>nm</italic></sub> pulse of duration &#x003C4;<sub><italic>etw</italic></sub> through a buffer with a threshold <italic>V</italic><sub><italic>th</italic></sub>. <bold>(C)</bold> Operation waveforms illustrating the <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) update process based on signals transmitted to and generated within the synaptic block. <italic>ET</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) pulses are generated when the delayed state signal <italic>S</italic><sub><italic>dn</italic></sub>(<italic>t</italic>), obtained by shifting <italic>S</italic><sub><italic>n</italic></sub>(<italic>t</italic>) by &#x003C4;<sub><italic>d</italic></sub>, coincides with <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>). 
When the yellow-shaded &#x003B1;(<italic>t</italic>) pulse overlaps with <italic>ET</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) pulses, <italic>LTP</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) spikes are induced by <italic>R</italic>(<italic>t</italic>) and &#x003B3;<italic>a</italic>(<italic>t</italic>), whereas <italic>LTD</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) spikes are induced by <italic>P</italic>(<italic>t</italic>) and <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>). Each <italic>LTP</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) and <italic>LTD</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) spike updates <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) by a single step.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1738140-g0002.tif">
<alt-text content-type="machine-generated">Diagram illustrating the synaptic learning mechanism used for spike-based Q-learning updates. Figure (A) shows the block diagram of an individual synapse, including logic gates, delay elements, an eligibility trace generator, and an up/down counter that stores Q(Sn,am). Delayed state spikes and action spikes jointly generate an eligibility trace that defines when learning is enabled. Figures (B) and (C) present time-based waveforms of key signals, including state, action, reward, penalty, and modulated signals. These waveforms illustrate the generation of LTP and LTD spikes and the resulting real-time increase or decrease of synaptic Q-values during the effective update window.</alt-text>
</graphic>
</fig>
<p>The eligibility trace generator can be realized using a capacitor&#x02013;MOSFET structure, in which capacitors integrate incoming spikes and discharge gradually through leakage, while a MOSFET gates the signal according to the resulting voltage. This circuit configuration produces a decaying trace that represents synaptic eligibility (<xref ref-type="bibr" rid="B26">Wijekoon and Dudek, 2011</xref>). The resulting trace is converted by a buffer into an <italic>ET</italic><sub><italic>nm</italic></sub> pulse of duration &#x003C4;<sub><italic>etw</italic></sub> (<xref ref-type="fig" rid="F2">Figure 2B</xref>), defining the time window in which learning is valid. As illustrated in <xref ref-type="fig" rid="F2">Figure 2C</xref>, <italic>ET</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) is generated at the synapse corresponding to <italic>Q</italic>(<italic>S, A</italic>) during the &#x003C4;<sub><italic>d</italic></sub> period of the next state. After the transition from <italic>s</italic><sub>1</sub> to <italic>s</italic><sub>2</sub>, the <italic>S</italic><sub><italic>d</italic>1</sub> spikes from the previous state <italic>s</italic><sub>1</sub> and the <italic>A</italic><sub><italic>d</italic>2</sub> spikes generate an <italic>ET</italic><sub>12</sub> pulse that remains HIGH for the duration of &#x003C4;<sub><italic>d</italic></sub>, enabling only <italic>Q</italic>(<italic>s</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>) to be updated in state <italic>s</italic><sub>2</sub>.</p>
<p>The up/down counter is employed to store <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) values and to update them using spike-based signals (<xref ref-type="fig" rid="F2">Figure 2A</xref>). The input spikes are generated by classifying the signals in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref> into those that increase <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) and those that decrease it, and combining each group with logic gates. Specifically, <italic>R</italic>(<italic>t</italic>) and &#x003B3;(<italic>t</italic>) are grouped for potentiation, and <italic>P</italic>(<italic>t</italic>) and <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>) are grouped for depression, with each pair combined through OR gates. The outputs of the OR gates are subsequently gated by &#x003B1;(<italic>t</italic>) and <italic>ET</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) using AND operations, producing <italic>LTP</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) and <italic>LTD</italic><sub><italic>nm</italic></sub>(<italic>t</italic>) signals.</p>
<p>The up/down counter receives <italic>LTP</italic><sub><italic>nm</italic></sub> spikes at its up input, increasing <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) by one count per spike, and <italic>LTD</italic><sub><italic>nm</italic></sub> spikes at its down input, decreasing <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) by one count per spike. <xref ref-type="fig" rid="F2">Figure 2C</xref> shows the synaptic updates of <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) driven by spikes in the proposed architecture. Within the time window where the &#x003B1; pulse and the <italic>ET</italic><sub><italic>nm</italic></sub> pulse coexist, <italic>LTP</italic><sub><italic>nm</italic></sub> spikes are generated from the combination of <italic>R</italic> spikes and &#x003B3;<italic>a</italic> spikes, while <italic>LTD</italic><sub><italic>nm</italic></sub> spikes arise from the combination of <italic>P</italic> spikes and <italic>A</italic><sub><italic>dm</italic></sub> spikes. Each occurrence of an <italic>LTP</italic><sub><italic>nm</italic></sub> spike results in a real-time increase in <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>), whereas each <italic>LTD</italic><sub><italic>nm</italic></sub> spike results in a real-time decrease.</p></sec></sec>
<sec>
<label>3.2</label>
<title>Cart-pole task environment</title>
<p>The cart-pole task, illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>, is a standard benchmark in reinforcement learning where a force is applied to a cart along the x-axis on a flat surface with the goal of maintaining the pole balanced on the cart (<xref ref-type="bibr" rid="B7">Geva and Sitte, 1993</xref>). In this study, simulations were conducted using the cart-pole environment provided in the Reinforcement Learning Toolbox of MATLAB. Each episode was initialized with the cart positioned at the origin and the pole in an upright orientation. At every 20 ms time step, a force of either &#x0002B;10 N or &#x02212;10 N was applied to the cart. An episode terminates in failure if the cart position exceeds &#x000B1;2.4 units from the origin or if the pole angle exceeds &#x000B1;12&#x000B0;. Conversely, an episode is considered successful if the pole remains balanced within these bounds for 4 s.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Cart-pole game environment.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1738140-g0003.tif">
<alt-text content-type="machine-generated">Diagram of an inverted pendulum on a cart. The cart is on wheels, moving along a horizontal line marked with -2.4, 0, and 2.4. An arrow denotes force (F) applied to the right. The pendulum is at an angle &#x003B8; from the vertical position.</alt-text>
</graphic>
</fig>
<p>The state variables of the cart-pole environment are the cart position <italic>x</italic>, cart velocity &#x01E8B;, pole angle &#x003B8;, and pole angular velocity <inline-formula><mml:math id="M7"><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mo>.</mml:mo></mml:mover></mml:math></inline-formula>. These variables were quantized as follows:</p>
<disp-formula id="E2"><mml:math id="M8"><mml:mtable columnalign="right"><mml:mtr><mml:mtd><mml:mi>x</mml:mi><mml:mo>:</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mn>2</mml:mn><mml:mo>.</mml:mo><mml:mn>4</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>.</mml:mo><mml:mn>4</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>;</mml:mo><mml:mi>&#x01E8B;</mml:mi><mml:mo>:</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x0221E;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x0221E;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>;</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>&#x003B8;</mml:mi><mml:mo>:</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mn>12</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>01</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>01</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>0</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>01</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>01</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>12</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>;</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mo>.</mml:mo></mml:mover><mml:mo>:</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x0221E;</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>87</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>87</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>87</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>87</mml:mn><mml:mo>,</mml:mo><mml:mi>&#x0221E;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The state set <italic>S</italic> consists of 19 elements, comprising 18 four-dimensional tuples from the combinations of the state variables and one failure state of the cart-pole task. Within the action set <italic>A</italic> = {&#x02212;10 N, 10 N}, the proposed architecture contained 38 synapses encoding the corresponding <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) values. For non-failure states <italic>s</italic><sub>1</sub>&#x02013;<italic>s</italic><sub>18</sub>, a reward of &#x0002B;1 is assigned, whereas for the failure state <italic>s</italic><sub>19</sub>, a penalty of &#x02212;8 is applied. The parameter &#x003B5; in the epsilon-greedy policy, which determines the probability of exploitation and exploration, was initialized at 1 and decayed by a factor of 0.7 across episodes.</p></sec></sec>
<sec id="s4">
<label>4</label>
<title>Experiments &#x00026; results</title>
<p>To evaluate the operation of the proposed non-von Neumann architecture in a hardware-oriented context, a high-level simulation model was implemented in MATLAB and interfaced with the cart-pole environment. All simulations were performed on a workstation with an Intel&#x000AE; Core&#x02122; i7-8700 CPU &#x00040; 3.20 GHz and 16 GB of RAM.</p>
<p>In the simulations, the model parameters were set as follows: the learning rate &#x003B1; = 1, the discount factor &#x003B3; = 0.99, a counter bit-width of 3 bits, a reward of &#x0002B;1, and a penalty of &#x02212;8. The firing frequency of the state neurons was fixed at 10 kHz, whereas the action neurons fired at frequencies ranging from 201 to 1,610 Hz depending on the <italic>Q</italic>-values stored in their corresponding synapses. The eligibility trace window &#x003C4;<sub><italic>etw</italic></sub> was set to 14 ms to ensure that, at the lowest action neuron frequency of 201 Hz, the trace generated by an <italic>A</italic><sub><italic>dm</italic></sub> spike persisted in the buffer until the next spike arrived. A detailed summary of the simulation parameters is provided in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Model parameters used in the simulation of the proposed architecture.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Parameter</bold></th>
<th valign="top" align="center"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Bit-width</td>
<td valign="top" align="center">3-bit</td>
</tr>
<tr>
<td valign="top" align="left"><italic>S</italic><sub><italic>n</italic></sub> freq (Hz)</td>
<td valign="top" align="center">10k</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C4;<sub><italic>d</italic></sub> (ms)</td>
<td valign="top" align="center">5</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C4;<sub>&#x003B1;</sub> (ms)</td>
<td valign="top" align="center">5</td>
</tr>
<tr>
<td valign="top" align="left">Reward freq (Hz)</td>
<td valign="top" align="center">205</td>
</tr>
<tr>
<td valign="top" align="left">Penalty freq (Hz)</td>
<td valign="top" align="center">1,700</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C4;<sub><italic>etw</italic></sub> (ms)</td>
<td valign="top" align="center">14</td>
</tr></tbody>
</table>
</table-wrap>
<p>Based on these parameters, we evaluated the proposed architecture in the cart-pole environment across 100 episodes. <xref ref-type="fig" rid="F4">Figure 4</xref> shows the simulated waveforms of Episodes 1, 30, and 100. The signals <italic>R</italic>(<italic>t</italic>), <italic>P</italic>(<italic>t</italic>), &#x003B3;<italic>a</italic>(<italic>t</italic>), and <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>) denote spike trains over time, whereas <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub>1</sub>) and <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub>2</sub>) represent the corresponding <italic>Q</italic>-values, updated in time and quantized to 3 bits. The colors of the <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub>1</sub>) and <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub>2</sub>) traces are matched to those of the corresponding <italic>S</italic><sub><italic>n</italic></sub> spikes to indicate correspondence.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Simulation results of the cart-pole task for episodes 1, 30, and 100, showing failures at 0.18 s and 1.94 s in episodes 1 and 30, and successful balance at 4 s in episode 100. Each panel shows the learning rate pulse &#x003B1;(<italic>t</italic>), the reward spikes <italic>R</italic>(<italic>t</italic>), the penalty spikes <italic>P</italic>(<italic>t</italic>), the state spikes <italic>S</italic><sub><italic>n</italic></sub>(<italic>t</italic>) for <italic>n</italic> = 1, 2, &#x02026;, 19, the action spikes <italic>A</italic><sub><italic>m</italic></sub>(<italic>t</italic>) for <italic>m</italic> = 1, 2, and the 3-bit quantized <italic>Q</italic>-values <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>), represented using integer levels from 1 to 8. The <italic>Q</italic>-value trajectories are shown in separate <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub>1</sub>) and <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub>2</sub>) panels, with each panel corresponding to a different subset of states (<italic>s</italic><sub>1</sub>&#x02013;<italic>s</italic><sub>6</sub>, <italic>s</italic><sub>7</sub>&#x02013;<italic>s</italic><sub>12</sub>, and <italic>s</italic><sub>13</sub>&#x02013;<italic>s</italic><sub>19</sub>).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1738140-g0004.tif">
<alt-text content-type="machine-generated">Three panels labeled Episode 1, Episode 30, and Episode 100 display multiple line graphs. Each episode shows stacked rows of graphs, with Episode 1 having mostly flat lines and sparse colors, while Episodes 30 and 100 exhibit more varied, dynamic patterns with multiple colors. The graphs indicate changes in parameters like (S_n(t)), (A_m(t)), and (Q(s_n, a_m)) over time. The x-axis shows time in seconds, increasing from left to right, with notable differences in data across episodes, suggesting progression or learning over time.</alt-text>
</graphic>
</fig>
<p>In episode 1, the <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) values were initialized to their maximum. Since all <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) values were identical at the start, most of them changed only slightly during learning. However, once the state transitioned to the failure state <italic>s</italic><sub>19</sub>, <italic>Q</italic>(<italic>s</italic><sub>18</sub>, <italic>a</italic><sub>1</sub>) decreased sharply in response to the <italic>P</italic> spikes, leading to the termination of the episode.</p>
<p>In episode 30, the initial <italic>Q</italic>-values reflected the learning accumulated from previous episodes. Within the green-shaded interval between 0.96 s and 1.44 s, the state transitioned from <italic>s</italic><sub>9</sub> to <italic>s</italic><sub>2</sub>, with the action <italic>A</italic><sub>1</sub> selected in both states. At <italic>s</italic><sub>9</sub>, the <italic>Q</italic>(<italic>s</italic><sub>9</sub>, <italic>a</italic><sub>1</sub>), shown by the thick blue trace, corresponds to <italic>Q</italic>(<italic>S, A</italic>), whereas at <italic>s</italic><sub>2</sub>, the <italic>Q</italic>(<italic>s</italic><sub>2</sub>, <italic>a</italic><sub>1</sub>), shown by the thick orange line, corresponds to <inline-formula><mml:math id="M9"><mml:msub><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. The Q-learning update defined in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref> was executed, causing <italic>Q</italic>(<italic>s</italic><sub>9</sub>, <italic>a</italic><sub>1</sub>) to decrease by four steps. Similar to episode 1, episode 30 also terminated when the state reached the failure state <italic>s</italic><sub>19</sub> at 1.44 s.</p>
<p>In episode 100, the simulation terminated successfully after maintaining balance for the full 4 s without entering the failure state <italic>s</italic><sub>19</sub>. The <italic>Q</italic>(<italic>s</italic><sub><italic>n</italic></sub>, <italic>a</italic><sub><italic>m</italic></sub>) had stabilized and, apart from minor deviations of approximately one step following updates, remained largely unchanged from their prior values.</p>
<p>The performance of the proposed architecture was evaluated by averaging scores every 20 episodes across 10 independent simulation runs. The score increased by 1 for every 20 ms in which the pole remained balanced, reaching a maximum of 200 when balance was maintained for 4 s.</p>
<p>The red traces in <xref ref-type="fig" rid="F5">Figure 5A</xref> show the average score per 20 episodes for each of the 10 simulations conducted with a 3-bit counter, while the black trace shows the mean of these averages across simulations. Although individual runs vary due to exploration governed by the epsilon-greedy policy, the results indicate that the average score reaches 200 within 100 episodes.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p><bold>(A)</bold> Learning curves obtained using a 3-bit counter in the proposed architecture. Red lines indicate the average score per 20 episodes for each of the 10 trials, and the black line shows the overall mean. <bold>(B)</bold> Comparison of average score per 20 episodes across different counter bit-widths: 5-bit (green), 4-bit (orange), 3-bit (pink), and 2-bit (yellow), and standard Q-learning (blue). The solid lines show the average score per 20 episodes over 10 trials, and the shaded areas represent the standard deviation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1738140-g0005.tif">
<alt-text content-type="machine-generated">Two line graphs show the average score per 20 episodes over 100 episodes in a cart-pole task. In graph (A), red traces represent the results of 10 simulation runs using a 3-bit counter, and the black trace shows their mean, which reaches a score of 200 within 100 episodes despite variability caused by e-greedy exploration. Graph (B) compares learning curves across different counter bit-widths (2-bit, 3-bit, 4-bit, and 5-bit) and conventional Q-learning. The curves show differing learning speeds and final performance levels, with some configurations reaching an average score of 200 earlier than others.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F5">Figure 5B</xref> compares the average score per 20 episodes across 10 simulations with &#x003B1; = 1 and &#x003B3; = 0.99, under the counter bit-widths of 2, 3, 4, and 5, as well as conventional Q-learning without bit limitations. The experimental parameters for each counter bit configuration are summarized in <xref ref-type="table" rid="T2">Table 2</xref>. In this experiment, the parameters for each bit-width configuration were selected to ensure stable operation of the architecture. The reward was fixed at the minimum unit of &#x0002B;1, while the penalty was set to the maximum negative value representable by each bit-width. Furthermore, the frequencies of the reward and penalty signals were adjusted so that the number of spikes associated with each value was appropriately reflected within the maximum valid time window &#x003C4;<sub><italic>d</italic></sub>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Parameters for different counter bit widths in the proposed architecture.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Parameter</bold></th>
<th valign="top" align="center" colspan="4"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Bit-width</td>
<td valign="top" align="center">2-bit</td>
<td valign="top" align="center">3-bit</td>
<td valign="top" align="center">4-bit</td>
<td valign="top" align="center">5-bit</td>
</tr>
<tr>
<td valign="top" align="left"><italic>S</italic><sub><italic>n</italic></sub> freq (Hz)</td>
<td valign="top" align="center">10k</td>
<td valign="top" align="center">10k</td>
<td valign="top" align="center">20k</td>
<td valign="top" align="center">40k</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C4;<sub><italic>d</italic></sub> (ms)</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">8</td>
<td valign="top" align="center">10</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C4;<sub>&#x003B1;</sub> (ms)</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">8</td>
<td valign="top" align="center">10</td>
</tr>
<tr>
<td valign="top" align="left">Reward freq (Hz)</td>
<td valign="top" align="center">505</td>
<td valign="top" align="center">205</td>
<td valign="top" align="center">127</td>
<td valign="top" align="center">105</td>
</tr>
<tr>
<td valign="top" align="left">Penalty freq (Hz)</td>
<td valign="top" align="center">2,200</td>
<td valign="top" align="center">1,700</td>
<td valign="top" align="center">2,050</td>
<td valign="top" align="center">3,250</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C4;<sub><italic>etw</italic></sub> (ms)</td>
<td valign="top" align="center">17</td>
<td valign="top" align="center">14</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">9</td>
</tr></tbody>
</table>
</table-wrap>
<p>With the 2-bit counter (yellow trace), the cart-pole task failed as the average score did not reach 200. The 3-bit counter (pink trace) achieved success approximately 40 episodes later than conventional Q-learning (blue trace), whereas the 4-bit (orange trace) and 5-bit (green trace) counters reached an average score of 200 within about 50 episodes, comparable to Q-learning. These results demonstrate that the proposed architecture can successfully solve the cart-pole task with a 3-bit counter, while performance comparable to Q-learning is obtained with a 4-bit counter.</p>
<p>The performance graph in <xref ref-type="fig" rid="F5">Figure 5B</xref>, generated using the parameters listed in <xref ref-type="table" rid="T2">Table 2</xref>, was analyzed using a one-way analysis of variance (ANOVA), and the results are summarized in <xref ref-type="table" rid="T3">Table 3</xref>. A statistically significant effect of quantization level on performance was observed [<italic>F</italic><sub>(4, 45)</sub> = 60.0544, <italic>p</italic>-value &#x0003C; 0.0001], encompassing unquantized Q-learning and 2&#x02013;5-bit representations. Subsequently, Tukey&#x00027;s honestly significant difference (HSD) <italic>post-hoc</italic> tests were performed to compare Q-learning with each bit-width and the results are presented in <xref ref-type="table" rid="T4">Table 4</xref>. <italic>Post-hoc</italic> analyses revealed no significant differences between Q-learning and the 5-bit, 4-bit, or 3-bit models (all <italic>p</italic>-values &#x02265; 0.987). In contrast, the 2-bit condition showed significantly lower performance compared with Q-learning (<italic>p</italic>-value &#x0003C; 0.0001).</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>One-way ANOVA across quantization levels.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Source</bold></th>
<th valign="top" align="center"><bold>SS</bold></th>
<th valign="top" align="center"><bold><italic>df</italic></bold></th>
<th valign="top" align="center"><bold>MS</bold></th>
<th valign="top" align="center"><bold><italic>F</italic>-value</bold></th>
<th valign="top" align="center"><bold><italic>p</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Quantization level</td>
<td valign="top" align="center">160,810</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">40,204</td>
<td valign="top" align="center">60.0544</td>
<td valign="top" align="center">&#x0003C; 0.0001</td>
</tr>
<tr>
<td valign="top" align="left">Error</td>
<td valign="top" align="center">30,125</td>
<td valign="top" align="center">45</td>
<td valign="top" align="center">669.45</td>
<td/>
<td/>
</tr>
<tr>
<td valign="top" align="left">Total</td>
<td valign="top" align="center">190,940</td>
<td valign="top" align="center">49</td>
<td/>
<td/>
<td/>
</tr></tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Tukey&#x00027;s HSD <italic>post-hoc</italic> comparisons between Q-learning and models with different bit-widths.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Comparison</bold></th>
<th valign="top" align="center"><bold>Mean diff</bold></th>
<th valign="top" align="center"><bold>95% CI</bold></th>
<th valign="top" align="center"><bold><italic>p</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Q-learning&#x02212;5-bit</td>
<td valign="top" align="center">&#x02212;0.0360</td>
<td valign="top" align="center">[&#x02212;32.9148, 32.8428]</td>
<td valign="top" align="center">1.0000</td>
</tr>
<tr>
<td valign="top" align="left">Q-learning&#x02212;4-bit</td>
<td valign="top" align="center">0.2700</td>
<td valign="top" align="center">[&#x02212;32.6088, 33.1488]</td>
<td valign="top" align="center">1.0000</td>
</tr>
<tr>
<td valign="top" align="left">Q-learning&#x02212;3-bit</td>
<td valign="top" align="center">5.7375</td>
<td valign="top" align="center">[&#x02212;27.1413, 38.6163]</td>
<td valign="top" align="center">0.9874</td>
</tr>
<tr>
<td valign="top" align="left">Q-learning&#x02212;2-bit</td>
<td valign="top" align="center">143.1677</td>
<td valign="top" align="center">[110.2889, 176.0465]</td>
<td valign="top" align="center">&#x0003C; 0.0001</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec sec-type="discussion" id="s5">
<label>5</label>
<title>Discussion</title>
<p>In this study, we proposed a non-von Neumann SNN architecture specialized for the Q-learning algorithm. The proposed system employs a hard-wired connectivity with a fixed network topology, in which each synapse stores a single <italic>Q</italic>-value, thereby reducing memory-access overhead through localized storage. This architectural approach contrasts with general-purpose neuromorphic processors such as Intel&#x00027;s Loihi, which adopt reconfigurable neural connectivity to support various network topologies but typically involve centralized or shared memory access, potentially leading to memory-access bottlenecks within the core. In this context, this work emphasizes algorithm-hardware co-optimization rather than hardware reconfigurability, suggesting a promising direction for improving computational efficiency. This approach aligns with prior studies emphasizing the need for co-design across multiple levels of neuromorphic systems&#x02014;including hardware, circuits, algorithms, and applications (<xref ref-type="bibr" rid="B17">Schuman et al., 2022</xref>)&#x02014;and suggests the potential of algorithm-centered hardware specialization as a direction for future neuromorphic hardware development.</p>
<p>These architectural differences are reflected in the energy efficiency and area characteristics. In terms of energy efficiency, the synaptic weights in Loihi are stored in SRAM, and each spike is processed through AER address decoding, synapse selection, memory access, and a read&#x02013;modify&#x02013;write update, with the spike delivered as a packet across the on-chip network. While this packet-based event-driven approach is highly efficient for sparse activity, the dynamic power consumed per spike can increase as spike events are transmitted in packet form. In contrast, in the proposed architecture, each <italic>Q</italic>-value is stored in a local counter and spikes are routed directly through fixed wiring, thereby avoiding packet conversion and address decoding and reducing the amount of data movement and the activation of update-related circuitry.</p>
<p>In terms of area, Loihi is designed such that the neurons and synapses within each core share a common computation and learning engine, whereas in the proposed architecture, dedicated processing units and local learning circuits are assigned to each neuron and synapse block. As a result, Loihi can achieve a relatively higher neuron and synapse density per unit area. However, when the full system architecture is considered, Loihi includes additional blocks such as the network on chip (NoC), AER interface logic for packet processing, and routers, which contribute non-negligibly to the overall chip area. By comparison, although separate blocks for the NoC and packet-based routing are not required in the proposed architecture, additional area overhead arises from the increased wiring needed for the fixed connectivity between neuron and synapse blocks. The practical impact of these factors in implementation will require further examination and careful evaluation.</p>
<p>Another notable aspect of the proposed architecture is its alignment with biological learning processes observed in the brain. In the proposed system, distributed computation occurs locally at each synapse, global reward signals are broadcast throughout the network, and synapse-specific learning is achieved through local signal generation&#x02014;analogous to the interplay between global modulatory signals and local synaptic events in the brain. In the brain, slow global signals such as hormones or neuromodulators regulate long-term learning, while local spike interactions at specific synapses drive plasticity (<xref ref-type="bibr" rid="B5">Brzosko et al., 2019</xref>). Similarly, the proposed architecture globally propagates both the reward and <inline-formula><mml:math id="M10"><mml:msub><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, and generates local selection signals through the coincidence of pre- and post-synaptic spikes corresponding to state&#x02013;action pairs. Moreover, the delay mechanism introduced to address temporal mismatches aligns with biological timing characteristics. Neural systems exhibit axonal conduction delays (<xref ref-type="bibr" rid="B13">Madadi Asl et al., 2017</xref>), synaptic transmission delays, and recurrent-circuit delays, all of which play crucial roles in learning mechanisms such as spike-timing-dependent plasticity (STDP). These similarities suggest that the proposed non-von Neumann architecture captures key functional aspects of biological learning mechanisms.</p>
<p>From a hardware perspective, this study demonstrated that a minimal 3-bit precision up/down counter used as a synaptic memory was sufficient to complete the cart-pole simulation within 100 episodes, confirming the feasibility of low-precision memory in practical learning. As the architecture scales, the number of synapses (<italic>p</italic>&#x000D7;<italic>q</italic>) grows much faster than the number of neurons (<italic>p</italic>&#x0002B;2<italic>q</italic>), making synaptic memory bit width and area efficiency critical constraints in hardware design. Therefore, the finding that stable learning can be achieved with as few as 3 bits supports the practical feasibility of implementing the proposed architecture on neuromorphic hardware.</p>
<p>Beyond precision considerations, it is also important to assess whether the proposed architecture remains robust when scaled to larger network sizes. In conventional von Neumann systems, <italic>Q</italic>-values are stored in centralized memory, requiring frequent memory accesses and substantial data movement during learning. Consequently, memory bottlenecks have been a major limitation when such systems are scaled. In contrast, in the proposed architecture, <italic>Q</italic>-values are stored in local counters within each synapse block, and learning is carried out in parallel across synapses, so that memory-related bottlenecks do not arise structurally during scaling.</p>
<p>A remaining concern in large-scale expansion is whether propagation delays along long signal routes could introduce timing mismatches in learning. In the proposed architecture, learning is based on counting spikes within an &#x003B1; pulse of duration &#x003C4;<sub>&#x003B1;</sub>, with a maximum timing tolerance defined by &#x003C4;<sub><italic>d</italic></sub>. Global update-related signals&#x02014;such as <italic>S</italic><sub><italic>n</italic></sub>(<italic>t</italic>), <italic>A</italic><sub><italic>dm</italic></sub>(<italic>t</italic>), &#x003B3;<italic>a</italic>(<italic>t</italic>), <italic>R</italic>(<italic>t</italic>), and <italic>P</italic>(<italic>t</italic>)&#x02014;are routed across synapse blocks through wires of varying lengths. Differences in wire lengths can introduce arrival-time variations, which may affect the number of spikes captured within the &#x003B1; pulse and lead to non-uniform Q-updates across the network. In the presented 3-bit implementation, update-related signals operate at a maximum frequency of 10 kHz, such that a 1% timing variation corresponds to approximately 1 &#x003BC;s. In 16&#x02013;22 nm technology nodes, the reported propagation delay is about 2 ns per millimeter [International Technology Roadmap for Semiconductors (ITRS), <xref ref-type="bibr" rid="B9">2007</xref>]. At this rate, a delay of 1 &#x003BC;s would accumulate only over wire lengths exceeding approximately 555 mm, which is far beyond the dimensions of a typical single chip. Even in multi-chip board-level configurations, substantial margin therefore remains before routing-induced delays would meaningfully affect learning behavior.</p>
<p>In addition to these hardware-level considerations, large-scale Q-learning presents challenges, particularly in terms of slower convergence and reduced generalization when the state&#x02013;action space becomes very large. As the number of states increases, experience becomes sparsely distributed across the space, reducing opportunities for repeated correction of specific situations. This sparsity slows learning and can lead to generalization errors in which the agent assigns inaccurate <italic>Q</italic>-values to insufficiently explored states. As the dimensionality of the environment grows, these challenges become more severe, often requiring substantially more interactions to achieve stable learning outcomes. In future work, large-scale simulations may be used to evaluate the impact of update sparsity on performance, and concepts inspired by similarity-based update approaches (<xref ref-type="bibr" rid="B15">Rosenfeld et al., 2017</xref>) may be incorporated to ensure that related state&#x02013;action pairs reflect the most recent environmental information even under sparse updates. Additionally, the proposed architecture incorporating these approaches may also be implemented on neuromorphic hardware.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>DS: Writing &#x02013; original draft, Conceptualization, Data curation, Formal analysis, Methodology, Software, Writing &#x02013; review &#x00026; editing, Validation, Visualization. HyeoJ: Data curation, Methodology, Writing &#x02013; original draft. HyesJ: Formal analysis, Writing &#x02013; original draft, Visualization. YHJ: Software, Validation, Writing &#x02013; original draft. YJ: Investigation, Writing &#x02013; review &#x00026; editing, Validation. JYK: Writing &#x02013; review &#x00026; editing, Investigation, Methodology. JP: Funding acquisition, Writing &#x02013; review &#x00026; editing, Investigation. SL: Writing &#x02013; review &#x00026; editing, Funding acquisition, Resources. IK: Investigation, Writing &#x02013; review &#x00026; editing. J-KP: Writing &#x02013; review &#x00026; editing, Investigation. SP: Validation, Writing &#x02013; review &#x00026; editing, Software. HyuJ: Writing &#x02013; review &#x00026; editing, Software, Validation. H-ML: Writing &#x02013; review &#x00026; editing, Investigation, Supervision. JK: Conceptualization, Funding acquisition, Investigation, Methodology, Project administration, Resources, Supervision, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<ack><title>Acknowledgments</title><p>We thank Sungsoo Han and Youngwoong Song for their technical assistance and valuable discussions.</p></ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>YHJ was employed by LG Electronics Inc.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abbott</surname> <given-names>L. F.</given-names></name></person-group> (<year>1999</year>). <article-title>Lapicque&#x00027;s introduction of the integrate-and-fire model neuron</article-title>. <source>Brain Res. Bull</source>. <volume>50</volume>, <fpage>303</fpage>&#x02013;<lpage>304</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0361-9230(99)00161-6</pub-id><pub-id pub-id-type="pmid">10643408</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Akl</surname> <given-names>M.</given-names></name> <name><surname>Sandamirskaya</surname> <given-names>Y.</given-names></name> <name><surname>Walter</surname> <given-names>F.</given-names></name> <name><surname>Knoll</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Porting deep spiking Q-networks to neuromorphic chip Loihi,&#x0201D;</article-title> in <source>ACM International Conference Proceeding Series</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>). doi: <pub-id pub-id-type="doi">10.1145/3477145.3477159</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Akopyan</surname> <given-names>F.</given-names></name> <name><surname>Sawada</surname> <given-names>J.</given-names></name> <name><surname>Cassidy</surname> <given-names>A.</given-names></name> <name><surname>Alvarez-Icaza</surname> <given-names>R.</given-names></name> <name><surname>Arthur</surname> <given-names>J.</given-names></name> <name><surname>Merolla</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>TrueNorth: design and tool flow of a 65 mW 1 million neuron programmable neurosynaptic chip</article-title>. <source>IEEE Trans. Comput. Aided Design Integrated Circuits Syst.</source> <volume>34</volume>, <fpage>1537</fpage>&#x02013;<lpage>1557</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TCAD.2015.2474396</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benjamin</surname> <given-names>B. V.</given-names></name> <name><surname>Gao</surname> <given-names>P.</given-names></name> <name><surname>McQuinn</surname> <given-names>E.</given-names></name> <name><surname>Choudhary</surname> <given-names>S.</given-names></name> <name><surname>Chandrasekaran</surname> <given-names>A. R.</given-names></name> <name><surname>Bussat</surname> <given-names>J. M.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Neurogrid: a mixed-analog-digital multichip system for large-scale neural simulations</article-title>. <source>Proc. IEEE</source> <volume>102</volume>, <fpage>699</fpage>&#x02013;<lpage>716</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JPROC.2014.2313565</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brzosko</surname> <given-names>Z.</given-names></name> <name><surname>Mierau</surname> <given-names>S. B.</given-names></name> <name><surname>Paulsen</surname> <given-names>O.</given-names></name></person-group> (<year>2019</year>). <article-title>Neuromodulation of spike-timing-dependent plasticity: past, present, and future</article-title>. <source>Neuron</source> <volume>103</volume>, <fpage>563</fpage>&#x02013;<lpage>581</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuron.2019.05.041</pub-id><pub-id pub-id-type="pmid">31437453</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Davies</surname> <given-names>M.</given-names></name> <name><surname>Srinivasa</surname> <given-names>N.</given-names></name> <name><surname>Lin</surname> <given-names>T.-H.</given-names></name> <name><surname>Chinya</surname> <given-names>G.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Choday</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2018</year>). <source>Loihi: A Neuromorphic Manycore Processor with On-Chip Learning</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://www.computer.org/micro">www.computer.org/micro</ext-link> (Accessed January 15, 2026).</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Geva</surname> <given-names>S.</given-names></name> <name><surname>Sitte</surname> <given-names>J.</given-names></name></person-group> (<year>1993</year>). <article-title>A cartpole experiment benchmark for trainable controllers</article-title>. <source>IEEE Control Syst. Magaz.</source> <volume>13</volume>, <fpage>40</fpage>&#x02013;<lpage>51</lpage>. doi: <pub-id pub-id-type="doi">10.1109/37.236324</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ha&#x0015F;egan</surname> <given-names>D.</given-names></name> <name><surname>Deible</surname> <given-names>M.</given-names></name> <name><surname>Earl</surname> <given-names>C.</given-names></name> <name><surname>D&#x00027;Onofrio</surname> <given-names>D.</given-names></name> <name><surname>Hazan</surname> <given-names>H.</given-names></name> <name><surname>Anwar</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Training spiking neuronal networks to perform motor control using reinforcement and evolutionary learning</article-title>. <source>Front. Comput. Neurosci.</source> <volume>16</volume>:<fpage>1017284</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fncom.2022.1017284</pub-id><pub-id pub-id-type="pmid">36249482</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book"><collab>International Technology Roadmap for Semiconductors (ITRS)</collab> (<year>2007</year>). <source>International Technology Roadmap for Semiconductors: Interconnect</source>. <publisher-loc>San Jose, CA</publisher-loc>: <publisher-name>ITRS</publisher-name>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kiselev</surname> <given-names>M.</given-names></name> <name><surname>Ivanitsky</surname> <given-names>A.</given-names></name> <name><surname>Larionov</surname> <given-names>D.</given-names></name></person-group> (<year>2025</year>). <article-title>A purely spiking approach to reinforcement learning</article-title>. <source>Cogn. Syst. Res.</source> <volume>89</volume>:<fpage>101317</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cogsys.2024.101317</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>G.</given-names></name> <name><surname>Deng</surname> <given-names>W.</given-names></name> <name><surname>Xie</surname> <given-names>X.</given-names></name> <name><surname>Huang</surname> <given-names>L.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>Human-level control through directly trained deep spiking Q-networks</article-title>. <source>IEEE Trans. Cybern.</source> <volume>53</volume>, <fpage>7187</fpage>&#x02013;<lpage>7198</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TCYB.2022.3198259</pub-id><pub-id pub-id-type="pmid">36063509</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Pan</surname> <given-names>W.</given-names></name></person-group> (<year>2023</year>). <article-title>Spiking neural-networks-based data-driven control</article-title>. <source>Electronics</source> <volume>12</volume>:<fpage>310</fpage>. doi: <pub-id pub-id-type="doi">10.3390/electronics12020310</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Madadi Asl</surname> <given-names>M.</given-names></name> <name><surname>Valizadeh</surname> <given-names>A.</given-names></name> <name><surname>Tass</surname> <given-names>P. A.</given-names></name></person-group> (<year>2017</year>). <article-title>Dendritic and axonal propagation delays determine emergent structures of neuronal networks with plastic synapses</article-title>. <source>Sci. Rep.</source> <volume>7</volume>:<fpage>39682</fpage>. doi: <pub-id pub-id-type="doi">10.1038/srep39682</pub-id><pub-id pub-id-type="pmid">28045109</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mehonic</surname> <given-names>A.</given-names></name> <name><surname>Kenyon</surname> <given-names>A. J.</given-names></name></person-group> (<year>2022</year>). <article-title>Brain-inspired computing needs a master plan</article-title>. <source>Nature</source> <volume>604</volume>, <fpage>255</fpage>&#x02013;<lpage>260</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-021-04362-w</pub-id><pub-id pub-id-type="pmid">35418630</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rosenfeld</surname> <given-names>A.</given-names></name> <name><surname>Taylor</surname> <given-names>M. E.</given-names></name> <name><surname>Kraus</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Speeding up tabular reinforcement learning using state-action similarities,&#x0201D;</article-title> in <source>Proceedings of the 16th International Conference on Autonomous Agents and Multiagent Systems (AAMAS 2017)</source>, eds. E. Durfee, M. Winikoff, K. Larson, and S. Das (<publisher-loc>Richland, SC</publisher-loc>: <publisher-name>International Foundation for Autonomous Agents and Multiagent Systems</publisher-name>), <fpage>1722</fpage>&#x02013;<lpage>1724</lpage>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Salomo</surname> <given-names>Y.</given-names></name> <name><surname>Syafalni</surname> <given-names>I.</given-names></name> <name><surname>Sutisna</surname> <given-names>N.</given-names></name> <name><surname>Adiono</surname> <given-names>T.</given-names></name></person-group> (<year>2025</year>). <article-title>Hardware-software stitching algorithm in lightweight Q-learning system on chip (SoC) for shortest path optimization</article-title>. <source>IEEE Access</source> <volume>13</volume>, <fpage>105044</fpage>&#x02013;<lpage>105062</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3578681</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schuman</surname> <given-names>C. D.</given-names></name> <name><surname>Kulkarni</surname> <given-names>S. R.</given-names></name> <name><surname>Parsa</surname> <given-names>M.</given-names></name> <name><surname>Mitchell</surname> <given-names>J. P.</given-names></name> <name><surname>Date</surname> <given-names>P.</given-names></name> <name><surname>Kay</surname> <given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>Opportunities for neuromorphic computing algorithms and applications</article-title>. <source>Nat. Comput. Sci.</source> <volume>2</volume>, <fpage>10</fpage>&#x02013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s43588-021-00184-y</pub-id><pub-id pub-id-type="pmid">38177712</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Seger</surname> <given-names>C.</given-names></name></person-group> (<year>2018</year>). <source>An Investigation of Categorical Variable Encoding Techniques in Machine Learning: Binary Versus One-hot and Feature Hashing</source> (Master&#x00027;s thesis). <publisher-loc>Stockholm</publisher-loc>: <publisher-name>KTH Royal Institute of Technology School of Electrical Engineering and Computer Science</publisher-name>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Siddique</surname> <given-names>A.</given-names></name> <name><surname>Vai</surname> <given-names>M. I.</given-names></name> <name><surname>Pun</surname> <given-names>S. H.</given-names></name></person-group> (<year>2023</year>). <article-title>A low cost neuromorphic learning engine based on a high performance supervised SNN learning algorithm</article-title>. <source>Sci. Rep.</source> <volume>13</volume>:<fpage>6280</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-023-32120-7</pub-id><pub-id pub-id-type="pmid">37072443</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Span&#x000F2;</surname> <given-names>S.</given-names></name> <name><surname>Cardarilli</surname> <given-names>G. C.</given-names></name> <name><surname>Di Nunzio</surname> <given-names>L.</given-names></name> <name><surname>Fazzolari</surname> <given-names>R.</given-names></name> <name><surname>Giardino</surname> <given-names>D.</given-names></name> <name><surname>Matta</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>An efficient hardware implementation of reinforcement learning: the q-learning algorithm</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>186340</fpage>&#x02013;<lpage>186351</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2961174</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sutton</surname> <given-names>R. S.</given-names></name> <name><surname>Barto</surname> <given-names>A. G.</given-names></name></person-group> (<year>2015</year>). <source>Reinforcement Learning: An Introduction, 2nd Edn</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Taherkhani</surname> <given-names>A.</given-names></name> <name><surname>Belatreche</surname> <given-names>A.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Cosma</surname> <given-names>G.</given-names></name> <name><surname>Maguire</surname> <given-names>L. P.</given-names></name> <name><surname>McGinnity</surname> <given-names>T. M.</given-names></name></person-group> (<year>2020</year>). <article-title>A review of learning in biologically plausible spiking neural networks</article-title>. <source>Neural Netw.</source> <volume>122</volume>, <fpage>253</fpage>&#x02013;<lpage>272</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2019.09.036</pub-id><pub-id pub-id-type="pmid">31726331</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>G.</given-names></name> <name><surname>Kumar</surname> <given-names>N.</given-names></name> <name><surname>Yoo</surname> <given-names>R.</given-names></name> <name><surname>Michmizos</surname> <given-names>K. P.</given-names></name></person-group> (<year>2020</year>). <source>Deep Reinforcement Learning with Population-Coded Spiking Neural Network for Continuous Control</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://github.com/combra-lab/pop-spiking-deep-rl">https://github.com/combra-lab/pop-spiking-deep-rl</ext-link> (Accessed January 15, 2026).</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tiwari</surname> <given-names>G.</given-names></name> <name><surname>Nakhate</surname> <given-names>S.</given-names></name> <name><surname>Pathak</surname> <given-names>A.</given-names></name> <name><surname>Jain</surname> <given-names>A.</given-names></name> <name><surname>Penurkar</surname> <given-names>S.</given-names></name></person-group> (<year>2025</year>). <article-title>&#x0201C;Hardware accelerators for deep learning applications,&#x0201D;</article-title> in <source>2025 IEEE International Students&#x00027; Conference on Electrical, Electronics and Computer Science, SCEECS 2025</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Institute of Electrical and Electronics Engineers Inc.</publisher-name>). doi: <pub-id pub-id-type="doi">10.1109/SCEECS64059.2025.10940371</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tran</surname> <given-names>D. D.</given-names></name> <name><surname>Le</surname> <given-names>T. T.</given-names></name> <name><surname>Duong</surname> <given-names>M. T.</given-names></name> <name><surname>Pham</surname> <given-names>M. Q.</given-names></name> <name><surname>Nguyen</surname> <given-names>M. S.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;FPGA design for deep Q-network: a case study in Cartpole environment,&#x0201D;</article-title> in <source>2022 International Conference on Multimedia Analysis and Pattern Recognition, MAPR 2022 &#x02013; Proceedings</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Institute of Electrical and Electronics Engineers Inc.</publisher-name>). doi: <pub-id pub-id-type="doi">10.1109/MAPR56351.2022.9925007</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wijekoon</surname> <given-names>J. H. B.</given-names></name> <name><surname>Dudek</surname> <given-names>P.</given-names></name></person-group> (<year>2011</year>). <article-title>&#x0201C;Analogue CMOS circuit implementation of a dopamine modulated synapse,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Symposium on Circuits and Systems (ISCAS 2011)</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Institute of Electrical and Electronics Engineers Inc.</publisher-name>), <fpage>877</fpage>&#x02013;<lpage>880</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ISCAS.2011.5937706</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yamazaki</surname> <given-names>K.</given-names></name> <name><surname>Vo-Ho</surname> <given-names>V. K.</given-names></name> <name><surname>Bulsara</surname> <given-names>D.</given-names></name> <name><surname>Le</surname> <given-names>N.</given-names></name></person-group> (<year>2022</year>). <article-title>Spiking neural networks and their applications: a review</article-title>. <source>Brain Sci.</source> <volume>12</volume>:<fpage>863</fpage>. doi: <pub-id pub-id-type="doi">10.3390/brainsci12070863</pub-id><pub-id pub-id-type="pmid">35884670</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zanatta</surname> <given-names>L.</given-names></name> <name><surname>Di Mauro</surname> <given-names>A.</given-names></name> <name><surname>Barchi</surname> <given-names>F.</given-names></name> <name><surname>Bartolini</surname> <given-names>A.</given-names></name> <name><surname>Benini</surname> <given-names>L.</given-names></name> <name><surname>Acquaviva</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Directly-trained spiking neural networks for deep reinforcement learning: energy efficient implementation of event-based obstacle avoidance on a neuromorphic accelerator</article-title>. <source>Neurocomputing</source> <volume>562</volume>:<fpage>126885</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2023.126885</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3006571/overview">Jiangrong Shen</ext-link>, Xi&#x00027;an Jiaotong University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2651837/overview">Zhaokun Zhou</ext-link>, Peking University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3073332/overview">Rong Xiao</ext-link>, Sichuan University, China</p>
</fn>
</fn-group>
</back>
</article>