<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Neurosci.</journal-id>
<journal-title>Frontiers in Computational Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-5188</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fncom.2024.1508297</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Spike-HAR&#x0002B;&#x0002B;: an energy-efficient and lightweight parallel spiking transformer for event-based human action recognition</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Lin</surname> <given-names>Xinxu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2767383/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Liu</surname> <given-names>Mingxuan</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2838558/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chen</surname> <given-names>Hong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2750504/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>School of Integrated Circuits, Tsinghua University</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>State Key Laboratory of Integrated Chips and Systems, Frontier Institute of Chip and System, Fudan University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Greater Bay Area National Center of Technology Innovation, Research Institute of Tsinghua University in Shenzhen</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>School of Biomedical Engineering, Tsinghua University</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Cong Shi, Chongqing University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Man Yao, Chinese Academy of Sciences (CAS), China</p>
<p>Peng Feng, Chinese Academy of Sciences (CAS), China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Hong Chen <email>hongchen&#x00040;tsinghua.edu.cn</email></corresp>
<fn fn-type="equal" id="fn002"><p>&#x02020;These authors have contributed equally to this work and share first authorship</p></fn></author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>11</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>18</volume>
<elocation-id>1508297</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>11</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Lin, Liu and Chen.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Lin, Liu and Chen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Event-based cameras are suitable for human action recognition (HAR) by providing movement perception with high dynamic range, high temporal resolution, high power efficiency and low latency. Spike Neural Networks (SNNs) are naturally suited to deal with the asynchronous and sparse data from the event cameras due to their spike-based event-driven paradigm, with less power consumption compared to artificial neural networks. In this paper, we propose two end-to-end SNNs, namely Spike-HAR and Spike-HAR&#x0002B;&#x0002B;, to introduce the spiking transformer into event-based HAR. Spike-HAR includes two novel blocks: a spike attention branch, which enables the model to focus on regions with high spike rates, reducing the impact of noise to improve the accuracy, and a parallel spike transformer block with a simplified spiking self-attention mechanism, increasing computational efficiency. To better extract crucial information from high-level features, we modify the architecture of the spike attention branch and extend it in Spike-HAR to a higher dimension, proposing Spike-HAR&#x0002B;&#x0002B; to further enhance classification performance. Comprehensive experiments were conducted on four HAR datasets: SL-Animals-DVS, N-LSA64, DVS128 Gesture and DailyAction-DVS, to demonstrate the superior performance of our proposed model. Additionally, the proposed Spike-HAR and Spike-HAR&#x0002B;&#x0002B; require only 0.03 and 0.06 mJ, respectively, to process a sequence of event frames, with model sizes of only 0.7 and 1.8 M. This efficiency positions them as a promising new SNN baseline for the HAR community. Code is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/Arktis2022/Spike-SLR">Spike-HAR&#x0002B;&#x0002B;</ext-link>.</p></abstract>
<kwd-group>
<kwd>spiking neural network</kwd>
<kwd>human action recognition</kwd>
<kwd>transformer</kwd>
<kwd>attention branch</kwd>
<kwd>event-based vision</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="6"/>
<equation-count count="16"/>
<ref-count count="81"/>
<page-count count="13"/>
<word-count count="8960"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Human action recognition (HAR) involves identifying and understanding human movements and has numerous applications in the real world (Sun et al., <xref ref-type="bibr" rid="B55">2022</xref>). For instance, HAR can be employed in visual surveillance systems to detect hazardous activities and monitor human behavior, thereby ensuring safe operations (Lin et al., <xref ref-type="bibr" rid="B29">2008</xref>). Additionally, HAR can facilitate sign language recognition (SLR). According to the latest data from the World Federation of the Deaf, there are 70 million deaf individuals worldwide using over 200 sign languages (Murray, <xref ref-type="bibr" rid="B39">2018</xref>). However, learning sign language can be challenging and time-consuming, creating communication barriers for the deaf community (Hu L. et al., <xref ref-type="bibr" rid="B21">2023</xref>). To address this issue, HAR for sign language recognition has been extensively researched. Most of the works focused on using RGB or gray-scale videos as input for HAR (Wang et al., <xref ref-type="bibr" rid="B61">2017</xref>; K&#x00131;nd&#x00131;roglu et al., <xref ref-type="bibr" rid="B27">2022</xref>; V&#x000E1;zquez-Enr&#x000ED;quez et al., <xref ref-type="bibr" rid="B58">2021</xref>; Mercanoglu Sincan and Keles, <xref ref-type="bibr" rid="B38">2022</xref>; Shen et al., <xref ref-type="bibr" rid="B48">2024</xref>; Wang F. et al., <xref ref-type="bibr" rid="B60">2023</xref>), due to their popularity and easy access. However, the recognition results of RGB-based HAR methods are inevitably influenced by the motion blur inherent to RGB cameras and static background noise (Wang et al., <xref ref-type="bibr" rid="B63">2019</xref>; Wang Y. et al., <xref ref-type="bibr" rid="B66">2022</xref>).</p>
<p>As an emerging neuromorphic sensor, the event camera detects changes in brightness for each pixel independently, generating an event stream asynchronously and sparsely. The difference between RGB video frames [from LSA64 (Ronchetti et al., <xref ref-type="bibr" rid="B43">2023</xref>)] and DVS event frames (from N-LSA64) is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The event camera features high temporal resolution, low latency, low power consumption, and a wide dynamic range (Su et al., <xref ref-type="bibr" rid="B53">2022</xref>), which can effectively address issues related to motion blur and static background noise. That is, event cameras hold significant advantages in the field of HAR. The current state-of-the-art (SOTA) approaches for event-based HAR involve first designing event aggregation strategies that convert the asynchronous output of the event camera into synchronous visual frames, followed by processing using Artificial Neural Networks (ANNs) (Ghosh et al., <xref ref-type="bibr" rid="B15">2019</xref>; Amir et al., <xref ref-type="bibr" rid="B3">2017</xref>; Baldwin et al., <xref ref-type="bibr" rid="B4">2022</xref>; Cannici et al., <xref ref-type="bibr" rid="B7">2020</xref>; Innocenti et al., <xref ref-type="bibr" rid="B25">2021</xref>; Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>), which require considerable computational power, posing challenges for deployment on edge devices.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p><bold>(a)</bold> Comparison of RGB video frames and DVS data frames for sign language Opaque (one-handed sign). <bold>(b)</bold> Comparison of RGB video frames and DVS data frames for sign language breakfast (two-handed sign).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-18-1508297-g0001.tif"/>
</fig>
<p>As third-generation neural networks, Spike Neural Networks (SNNs) are designed with biological plausibility, mimicking the dynamics of brain neurons to encode and transmit information in the form of spikes (Maass, <xref ref-type="bibr" rid="B36">1997</xref>). Compared to ANNs, the event-driven nature of SNNs significantly reduces energy consumption when running on neuromorphic chips (Zhang et al., <xref ref-type="bibr" rid="B77">2023</xref>, <xref ref-type="bibr" rid="B78">2021</xref>). However, current SNN-based HAR tasks still face challenges of lack of datasets and low recognition accuracy (Shi et al., <xref ref-type="bibr" rid="B49">2023</xref>).</p>
<p>In this paper, we propose two models, Spike-HAR and Spike-HAR&#x0002B;&#x0002B;, to simultaneously reduce power consumption and enhance recognition accuracy in event-based HAR. Spike-HAR integrates a patch embedding (PE) block, parallel transformer blocks, a spike attention branch, and a classification head. To further improve performance, we modify the architecture and position of the spike attention branch in Spike-HAR according to Hu et al. (<xref ref-type="bibr" rid="B22">2024</xref>) and extend it to a higher dimension, proposing Spike-HAR&#x0002B;&#x0002B;, which enables better extraction of crucial information from high-level features. As illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>, experiments on the SL-Animals-DVS dataset (Vasudevan et al., <xref ref-type="bibr" rid="B56">2022</xref>) demonstrate that both models significantly outperform other event-based HAR systems while maintaining lower levels of power consumption.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Accuracy vs. inference energy of different neural methods implemented in Intel Stratix 10 TX (Corporation, <xref ref-type="bibr" rid="B8">2023</xref>) (for ANNs) or ROLLS (Qiao et al., <xref ref-type="bibr" rid="B42">2015</xref>) (for SNNs). The size of the markers denotes the number of parameters.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-18-1508297-g0002.tif"/>
</fig>
<p>This paper is an extended version of our prior work (Lin et al., <xref ref-type="bibr" rid="B30">2024</xref>) accepted by BMVC 2024. The main differences with the conference version are as follows: (1) besides the Spike-HAR based on the Parallel Spiking Transformer (referred to as Spike-SLR in the BMVC version), we newly propose Spike-HAR&#x0002B;&#x0002B;, which is better suited for recognizing long-duration actions; (2) the application scope of the models is extended from sign language recognition to human action recognition, with comprehensive testing conducted on two additional datasets: DVS128 Gesture (Amir et al., <xref ref-type="bibr" rid="B3">2017</xref>) and DailyAction-DVS (Liu et al., <xref ref-type="bibr" rid="B33">2021</xref>), achieving SOTA performance; (3) a detailed overview of traditional ANN-based and SNN-based HAR methods, as well as the development of spiking transformers, is provided in the related work. To sum up, the main contributions of this paper are listed as follows:</p>
<p>(1) We propose the Spike-HAR family, i.e., Spike-HAR and Spike-HAR&#x0002B;&#x0002B;, which mainly consists of a powerful parallel spike transformer block. To the best of our knowledge, it is the first spiking transformer specifically designed for event-based HAR. To enhance the model&#x00027;s spatio-temporal attention to fine-grained action features while maintaining energy efficiency and a lightweight design, we employ a parallel spiking transformer. In this architecture, multi-layer perceptrons (MLPs) and simplified attention sub-modules (CB-S3A) operate in parallel to improve overall efficiency.</p>
<p>(2) We first introduce attention mask mechanisms into SNNs and incorporate a spike attention branch in our model to extract key regions from the input event streams. Additionally, we improve the attention operation for Spike-HAR&#x0002B;&#x0002B;, utilizing high-dimensional features extracted through a patch embedding (PE) block to accommodate the recognition of long-duration actions. Experiments demonstrate that, although the parameter count and power consumption of Spike-HAR&#x0002B;&#x0002B; increase slightly, the accuracy of HAR improves significantly.</p>
<p>(3) Experimental results on the public datasets SL-Animals-DVS (Vasudevan et al., <xref ref-type="bibr" rid="B56">2022</xref>), N-LSA64 (Ronchetti et al., <xref ref-type="bibr" rid="B43">2023</xref>) [converted using the v2e (Hu et al., <xref ref-type="bibr" rid="B23">2021</xref>) method], DVS128 Gesture (Amir et al., <xref ref-type="bibr" rid="B3">2017</xref>), and DailyAction-DVS (Liu et al., <xref ref-type="bibr" rid="B33">2021</xref>) show that the proposed Spike-HAR family effectively balances model size and recognition accuracy. Specifically, the proposed Spike-HAR and Spike-HAR&#x0002B;&#x0002B; require only 0.03 and 0.06 mJ, respectively, to process a sequence of event frames, with model size of just 0.7 and 1.8 M.</p>
<p>In the rest of the paper, Section 2 presents the related work on event-based HAR and spiking transformers. Section 3 begins with an overview of the overall architecture of Spike-HAR and Spike-HAR&#x0002B;&#x0002B;, followed by a detailed description of each model component. Section 4 introduces four HAR benchmark datasets and evaluation metrics, along with rigorous ablation studies, visualizations, and performance evaluations of the proposed models. Finally, Section 5 concludes the paper.</p></sec>
<sec id="s2">
<title>2 Related work</title>
<sec>
<title>2.1 Event-based human action recognition</title>
<p>Human action recognition aims to assign labels to various human behaviors and has wide applications in fields such as visual surveillance systems (Prati et al., <xref ref-type="bibr" rid="B41">2019</xref>; Lin et al., <xref ref-type="bibr" rid="B29">2008</xref>; Nasir et al., <xref ref-type="bibr" rid="B40">2022</xref>), sign language recognition (Lin et al., <xref ref-type="bibr" rid="B30">2024</xref>), autonomous navigation systems (Wang Q. et al., <xref ref-type="bibr" rid="B62">2022</xref>), and video retrieval (Sahoo et al., <xref ref-type="bibr" rid="B46">2020</xref>). Traditional HAR methods commonly use RGB or grayscale video as input due to their accessibility. However, HAR based on RGB modalities is not robust to illumination changes and is susceptible to motion artifacts (Sun et al., <xref ref-type="bibr" rid="B55">2022</xref>). Additionally, the large data size of RGB videos results in high computational costs when modeling spatiotemporal context for HAR. To address the above problems, alternative data forms for HAR have emerged, such as skeleton (Wang and Yan, <xref ref-type="bibr" rid="B59">2023</xref>), depth (Sahoo et al., <xref ref-type="bibr" rid="B46">2020</xref>), infrared sequences (Ding et al., <xref ref-type="bibr" rid="B10">2022</xref>), point clouds (Yu et al., <xref ref-type="bibr" rid="B76">2022</xref>), and event streams. This study focuses on event-based HAR, as event cameras offer high dynamic range, low latency, low power consumption, and eliminate motion blur, making them well-suited for HAR. Furthermore, the captured frames typically lack background information, which aids in action understanding.</p>
<p>The methods for event-based HAR can be primarily categorized into ANN-based and SNN-based (Gao et al., <xref ref-type="bibr" rid="B14">2023</xref>). For ANN-based methods, representative studies mainly utilize 3D CNNs or transformers to learn features in both spatial and temporal domains, thereby aggregating information from adjacent frames. For example, Wang et al. (<xref ref-type="bibr" rid="B64">2024</xref>) presented a novel event stream-based action recognition model called EVMamba, which integrates a spatial plane multi-directional scanning mechanism with an innovative voxel temporal scanning mechanism to effectively extract spatio-temporal information from event streams. Acin et al. (<xref ref-type="bibr" rid="B2">2023</xref>) introduced VK-SITS, a new event data representation using the ResNet18 network, which outperformed other methods such as TORE (Baldwin et al., <xref ref-type="bibr" rid="B4">2022</xref>) and SITS (Manderscheid et al., <xref ref-type="bibr" rid="B37">2019</xref>). Additionally, Sabater et al. (<xref ref-type="bibr" rid="B44">2022</xref>) developed EVT, an efficient transformer model that leverages the sparsity of event data, achieving SOTA results on the SL-Animals-DVS dataset. They further improved EVT by employing a finer patch-based event data representation with richer spatio-temporal information, resulting in the introduction of the EVT&#x0002B; model (Sabater et al., <xref ref-type="bibr" rid="B45">2023</xref>). Gao et al. (<xref ref-type="bibr" rid="B14">2023</xref>) proposed the EV-ACT framework, which consists of an event voxel filtering module, a learnable multi-representation fusion module, an event-based slow-fast network, and an event-based spatio-temporal attention mechanism. This framework was tested on a new event-based HAR benchmark called THU<sup>E &#x02212; ACT</sup>-50 and its accompanying dataset, THU<sup>E &#x02212; ACT</sup>-50-CHL. 
Although ANN-based methods have achieved SOTA performance, they often involve high power consumption and a large number of model parameters due to the large data volume and significant information redundancy introduced by the temporal dimension, making them less suitable for edge applications in HAR. To address the problem, SNN-based methods have been proposed, leveraging their inherent temporal dynamics and energy efficiency. Specifically, Vasudevan et al. (<xref ref-type="bibr" rid="B56">2022</xref>) introduced the SL-Animals-DVS dataset and evaluated three types of SNNs, including SLAYER (Shrestha and Orchard, <xref ref-type="bibr" rid="B50">2018</xref>), STBP (Wu et al., <xref ref-type="bibr" rid="B68">2018</xref>), and DECOLLE (Kaiser et al., <xref ref-type="bibr" rid="B26">2020</xref>), where the test accuracy for all models remained below 75%. Liu et al. (<xref ref-type="bibr" rid="B33">2021</xref>) were the first to apply motion information in SNNs for event-based action recognition, surpassing existing SNN methods on three datasets, including DailyAction-DVS. Although SNNs can achieve energy-efficient recognition, they often yield suboptimal results.</p></sec>
<sec>
<title>2.2 Spiking transformers</title>
<p>ANN-based transformers have achieved success in fields such as vision and natural language processing (NLP) (Achiam et al., <xref ref-type="bibr" rid="B1">2023</xref>; Han et al., <xref ref-type="bibr" rid="B19">2022</xref>). However, the exploration of self-attention (SA) mechanisms based on SNNs remains limited, primarily because the multiplication operations inherent in vanilla self-attention (VSA) mechanism (Vaswani et al., <xref ref-type="bibr" rid="B57">2017</xref>) are incompatible with SNNs. Recently, research has increasingly focused on developing the spiking transformer, aiming at eliminating multiplication operations in SA to reduce computational complexity. Zhou et al. (<xref ref-type="bibr" rid="B79">2022</xref>) were the first to introduce a spiking transformer model, termed Spikformer, which utilizes spike-based Query, Key, and Value to model sparse visual features, thereby avoiding softmax computations. Subsequently, Yao et al. (<xref ref-type="bibr" rid="B74">2024b</xref>) introduced the Spike-driven Transformer, which enhances the spiking self-attention (SSA) mechanism in Spikformer. They proposed a Spike-Driven Self-Attention (SDSA) mechanism that utilizes only masking and addition to implement the SA mechanism, reducing the computational complexity from <italic>O</italic>(<italic>ND</italic><sup>2</sup>) to <italic>O</italic>(<italic>ND</italic>). Wang Z. et al. (<xref ref-type="bibr" rid="B67">2023</xref>) introduced a novel Masked Spike Transformer (MST) framework, incorporating a Random Spike Masking (RSM) method, to further prune redundant spikes and reduce energy consumption without sacrificing performance. 
These explorations of spiking transformers enhance the learning capabilities of SNNs, enabling their application in various fields such as audio-visual classification, human pose tracking, and remote photoplethysmography (Guo et al., <xref ref-type="bibr" rid="B18">2023</xref>; Zou et al., <xref ref-type="bibr" rid="B81">2023</xref>; Liu et al., <xref ref-type="bibr" rid="B31">2024</xref>). However, there is a lack of spiking transformers specifically designed for event-based HAR. We are the first to propose Spike-HAR (Lin et al., <xref ref-type="bibr" rid="B30">2024</xref>), which is primarily composed of an energy-efficient parallel spiking transformer and has been tested on two DVS sign language datasets. Subsequently, SVFormer (Yu et al., <xref ref-type="bibr" rid="B75">2024</xref>) was introduced as a direct training spiking transformer for efficient video action recognition, but it mainly focuses on RGB-based HAR. Wang X. et al. (<xref ref-type="bibr" rid="B65">2023</xref>) proposed a model called SSTFormer, which bridges SNNs and memory support transformers. However, SSTFormer is a hybrid SNN-ANN network that requires both RGB frames and event streams to perform HAR. Therefore, dedicated spiking transformer models for event-based HAR still require further investigation and validation on larger-scale datasets.</p></sec></sec>
<sec sec-type="methods" id="s3">
<title>3 Methodology</title>
<p>The proposed Spike-HAR and Spike-HAR&#x0002B;&#x0002B; apply the spiking transformer to HAR tasks. We utilize the SNN algorithms provided in the SpikingJelly platform (Fang et al., <xref ref-type="bibr" rid="B11">2023</xref>), employing the Leaky Integrate-and-Fire (LIF) (Stein and Hodgkin, <xref ref-type="bibr" rid="B52">1967</xref>) neuron model for constructing the spiking neuron layers. LIF can be simply expressed by the following equations:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>H</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>V</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>V</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E2"><label>(2)</label><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>S</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x00398;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>H</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E3"><label>(3)</label><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>V</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>S</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>t</italic> denotes the timestep, &#x003C4; represents the membrane time constant, <italic>X</italic>[<italic>t</italic>] denotes the synaptic input current at time step <italic>t</italic>, and <italic>H</italic>[<italic>t</italic>] is the neuron&#x00027;s membrane potential after charging and before spiking, derived by integrating the input current. The spike occurrence at time <italic>t</italic>, denoted by <italic>S</italic>[<italic>t</italic>], is determined by the Heaviside step function &#x00398;, which outputs a spike (value of 1) when <italic>H</italic>[<italic>t</italic>] surpasses the firing threshold <italic>V</italic><sub><italic>th</italic></sub>, indicating an action potential. <italic>V</italic>[<italic>t</italic>] represents the membrane potential after spiking, which equals <italic>H</italic>[<italic>t</italic>] if no spike occurs and is otherwise reset to <italic>V</italic><sub><italic>reset</italic></sub>.</p>
<sec>
<title>3.1 Overall architecture</title>
<p>To lighten the models, we adopt fewer weight parameters and simpler model structures. The parameter counts of Spike-HAR and Spike-HAR&#x0002B;&#x0002B; are provided in <xref ref-type="table" rid="T6">Table 6</xref> and are lower than those of most models. In terms of model structure, Spike-HAR and Spike-HAR&#x0002B;&#x0002B; use a simpler data preprocessing layer than the Spiking Transformer, and both models use only two MLP layers. <xref ref-type="fig" rid="F3">Figure 3a</xref> illustrates the structure of Spike-HAR and Spike-HAR&#x0002B;&#x0002B;, both of which consist of four main components: the patch embedding (PE) block, the parallel spike-driven transformer block, the spike attention branch, and the classification head. The PE block extracts spatio-temporal representations from the input DVS frames, while the CB-S3A module in the transformer and the spike firing rate map in the spike attention branch direct the model&#x00027;s focus toward key features. The final prediction head maps these features to possible sign language expressions.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Framework of Spike-HAR and Spike-HAR&#x0002B;&#x0002B;. We follow the network structure in Yao et al. (<xref ref-type="bibr" rid="B74">2024b</xref>). It consists of an SNN-based patch embedding (PE) block, several parallel spike-driven transformer blocks, a spike attention branch, and an SNN-based predictor head. <bold>(a)</bold> Architecture of Spike-HAR and Spike-HAR&#x0002B;&#x0002B;. <bold>(b)</bold> Attention branch for Spike-HAR. <bold>(c)</bold> Attention branch for Spike-HAR&#x0002B;&#x0002B;.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-18-1508297-g0003.tif"/>
</fig><p>The input is a 2D DVS frame sequence <inline-formula><mml:math id="M4"><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>, where <italic>T</italic><sub>0</sub>, 2, <italic>H</italic><sub>0</sub> and <italic>W</italic><sub>0</sub> represent the time step, initial number of channels, height and width, respectively. First, we randomly select continuous event frames with a time step of <italic>T</italic> (<italic>T</italic> &#x02264; <italic>T</italic><sub>0</sub>) and crop each event frame spatially to obtain the preprocessed frames (PR), denoted as <italic>I</italic>&#x02208;&#x0211D;<sup><italic>T</italic>&#x000D7;2 &#x000D7; <italic>H</italic>&#x000D7;<italic>W</italic></sup>. 
The SNN-Based PE block, consisting of four 2D convolutional (Conv2D) layers, three batch normalization (BN) layers, three SNN layers and two max pooling (MP) layers, downsamples the input frames and partitions them into spatio-temporal spike tokens <inline-formula><mml:math id="M5"><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup></mml:math></inline-formula>, where <italic>D</italic> represents the number of channels. Before entering the data into the parallel Spike-driven Transformer block, we use a membrane potential residual connection to avoid network degradation, adding <italic>S</italic><sub><italic>PE</italic></sub> and the output <italic>I</italic><sub><italic>PE</italic></sub> of the initial three convolutional layers, which results in the input <italic>S</italic><sub>0</sub> with the same shape as <italic>S</italic><sub><italic>PE</italic></sub>. Therefore, the SNN-based PE block can be written as follows:</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M6"><mml:mrow><mml:mi>I</mml:mi><mml:mo>=</mml:mo><mml:mtext>PR</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mi>I</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>I</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></disp-formula>
<disp-formula id="E5"><label>(5)</label><mml:math id="M7"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>P</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>PE</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi>I</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>P</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>H</mml:mi><mml:mn>4</mml:mn></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>W</mml:mi><mml:mn>4</mml:mn></mml:mfrac></mml:mrow></mml:msup></mml:mrow></mml:math></disp-formula>
<disp-formula id="E6"><label>(6)</label><mml:math id="M8"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>P</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>P</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mi>S</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>H</mml:mi><mml:mn>4</mml:mn></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>W</mml:mi><mml:mn>4</mml:mn></mml:mfrac></mml:mrow></mml:msup></mml:mrow></mml:math></disp-formula>
<p>Then, the spike sequence <italic>S</italic><sub>0</sub> is passed to the parallel spike-driven transformer blocks, each of which consists of a conv-based simplified spiking self-attention (CB-S3A) block and an MLP block. As the main component in Spike-HAR and Spike-HAR&#x0002B;&#x0002B;, CB-S3A, which just performs the convolution operation in spike-form Query (<italic>Q</italic>) and Key (<italic>K</italic>), offers an efficient method to model the local-global information of frames without softmax. In addition, the spike fire map generated by the spike attention branch performs a mask operation on the data produced by the second convolution in the MLP block, which makes the model focus more on local features. The outputs of the MLP and the CB-S3A blocks are summed together, and the sum is then added to the input <italic>S</italic><sub>0</sub> again using a membrane potential residual connection (RES). After <italic>L</italic> transformer blocks, the final output membrane potential <italic>S</italic><sub><italic>L</italic></sub> is obtained. To obtain the pulse expression consisting of only 0 and 1, <italic>S</italic><sub><italic>L</italic></sub> is then passed to a spike neuron layer (<inline-formula><mml:math id="M9"><mml:mrow><mml:mi mathvariant="script">SN</mml:mi></mml:mrow></mml:math></inline-formula>), resulting in <italic>S</italic><sub><italic>E</italic></sub>. Finally, <italic>S</italic><sub><italic>E</italic></sub> is sent to an SNN-based classification head (SCH) to output the classification result <italic>Y</italic>. To summarize, the outputs of CB-S3A, MLP and SCH can be written as follows:</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M10"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mi>S</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mtext>CB-S3A</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mtext>MLP</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mtext>&#x02009;&#x02009;&#x02009;</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:msub><mml:mi>S</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>H</mml:mi><mml:mn>4</mml:mn></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>W</mml:mi><mml:mn>4</mml:mn></mml:mfrac></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mn>1...</mml:mn><mml:mi>L</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E8"><label>(8)</label><mml:math id="M11"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mi>E</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant='script'>S</mml:mi><mml:mi mathvariant='script'>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:msub><mml:mi>S</mml:mi><mml:mi>E</mml:mi></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>H</mml:mi><mml:mn>4</mml:mn></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mi>W</mml:mi><mml:mn>4</mml:mn></mml:mfrac></mml:mrow></mml:msup></mml:mrow></mml:math></disp-formula>
<disp-formula id="E9"><label>(9)</label><mml:math id="M12"><mml:mrow><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>C</mml:mi><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mi>E</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
</sec>
<sec>
<title>3.2 Attention masks</title>
<p>DVS data can be influenced by noise from various sources, such as environmental background noise. As neural networks deepen, some noise may be amplified, causing the model to focus on irrelevant features. Inspired by Liu X. et al. (<xref ref-type="bibr" rid="B34">2020</xref>), we insert attention blocks into our model to minimize the negative impact of background noise while allowing the model to focus on the target area and local features. To account for the differences among body parts, an attention mask is applied to assign higher weights to pixels with stronger spike signals; it also serves as the bridge between the attention appearance and the backbone network. The difference between Spike-HAR and Spike-HAR&#x0002B;&#x0002B; lies in their implementation of the attention branch. In Spike-HAR, the attention map is generated by directly averaging at the frame level, while Spike-HAR&#x0002B;&#x0002B; performs information extraction at a high-dimensional feature scale. The implementation methods of both are described in detail below.</p>
<sec>
<title>3.2.1 Spike-HAR</title>
<p>As shown in <xref ref-type="fig" rid="F3">Figure 3b</xref>, unlike the data processing operations performed in the PE block, we first perform a sum-average-repeat (SAR) operation on the data in the attention appearance. Specifically, we sum the event frames in the time dimension to combine multiple frames <italic>I</italic>&#x02208;&#x0211D;<sup><italic>T</italic>&#x000D7;2 &#x000D7; <italic>H</italic>&#x000D7;<italic>W</italic></sup> into a single frame <inline-formula><mml:math id="M14"><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>I</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>. Then, we divide the frame data by time step to obtain the average frame <inline-formula><mml:math id="M15"><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>A</mml:mi><mml:mi>V</mml:mi><mml:mi>G</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>&#x00124;</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>&#x00174;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and replicate the <italic>I</italic><sub><italic>AVG</italic></sub> in the time dimension for <italic>T</italic> times as the input to the spike attention branch. 
The data <inline-formula><mml:math id="M16"><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>&#x00124;</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>&#x00174;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> undergoes two rounds of convolution and downsampling, followed by another SAR operation to obtain a spike fire rate map, which is then masked with the data in the MLP to facilitate communication between the branch and the backbone network as shown in <xref ref-type="fig" rid="F3">Figure 3a</xref>.</p></sec>
<sec>
<title>3.2.2 Spike-HAR&#x0002B;&#x0002B;</title>
<p>Directly summing event frames along the temporal dimension can efficiently aggregate critical spatial information at a low cost. However, it may fail during significantly prolonged actions. To address this issue, we utilize the spatio-temporal spike tokens <inline-formula><mml:math id="M17"><mml:mrow><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> extracted from the PE block and perform a SAR operation along the temporal dimension. These tokens are subsequently fed into a new spike attention branch, where they undergo two LIF-Conv-BN operations (shown in <xref ref-type="fig" rid="F3">Figure 3c</xref>), followed by averaging along the temporal dimension to produce the attention mask. By leveraging the key features extracted by the PE block, the generated multi-channel mask is more representative. Experiments (Section 4) demonstrate that, although this adjustment increases power consumption by 0.03 mJ and model complexity as the convolution block must handle a larger number of feature channels, it significantly enhances HAR accuracy across various datasets.</p></sec></sec>
<sec>
<title>3.3 Parallel spike-driven transformer</title>
<p>In the previous spiking Transformer architecture (Zhou et al., <xref ref-type="bibr" rid="B79">2022</xref>; Yao et al., <xref ref-type="bibr" rid="B74">2024b</xref>,<xref ref-type="bibr" rid="B72">a</xref>), the output <italic>U</italic><sub><italic>out</italic></sub> of the backbone network is transformed from the input <italic>U</italic><sub><italic>in</italic></sub> consisting of <italic>N</italic> tokens with dimension <italic>D</italic> using two consecutive sub-blocks (one SA and one MLP) with residual connections:</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M18"><mml:mrow><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mtext>FF</mml:mtext></mml:mrow></mml:msub><mml:mover accent='true'><mml:mi>U</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003B2;</mml:mi><mml:mrow><mml:mtext>FF</mml:mtext></mml:mrow></mml:msub><mml:mtext>MLP</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi mathvariant='script'>S</mml:mi><mml:mi mathvariant='script'>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mover accent='true'><mml:mi>U</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E11"><label>(11)</label><mml:math id="M19"><mml:mrow><mml:mover accent='true'><mml:mi>U</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mtext>SA</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mtext>in</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003B2;</mml:mi><mml:mrow><mml:mtext>SA</mml:mtext></mml:mrow></mml:msub><mml:mtext>SA</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi mathvariant='script'>S</mml:mi><mml:mi mathvariant='script'>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mtext>in</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where the scalar gain weights &#x003B1;<sub>FF</sub>, &#x003B2;<sub>FF</sub>, &#x003B1;<sub>SA</sub>, &#x003B2;<sub>SA</sub> are fixed to 1 by default. In our work, to simplify the transformer block, we remove the residual connections in the MLP sub-blocks, obtaining the following output:</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">out</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>m</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">in</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">FF</mml:mtext></mml:mrow></mml:msub><mml:mtext class="textrm" mathvariant="normal">MLP</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">SN</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">in</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">SA</mml:mtext></mml:mrow></mml:msub><mml:mtext class="textrm" mathvariant="normal">SA</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">SN</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">in</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>with skip gain &#x003B1;<sub><italic>comb</italic></sub> &#x0003D; 1, and residual gains &#x003B2;<sub>FF</sub> &#x0003D; &#x003B2;<sub>SA</sub> &#x0003D; 1 as default. In the submodule CB-S3A, we first input the spike signals <italic>S</italic><sub>0</sub> into the spike neuron layer to obtain <italic>S</italic>&#x02032;. Then, we use 2D convolution operations to extract spatial information separately, resulting in <italic>Q</italic> and <italic>K</italic>. The acquisition of <italic>V</italic> does not involve convolution operations. After that, we use the spike neuron layer again to transform <italic>Q</italic>, <italic>K</italic>, and <italic>V</italic> into spike tensors <italic>Q</italic><sub><italic>S</italic></sub>, <italic>K</italic><sub><italic>S</italic></sub>, and <italic>V</italic><sub><italic>S</italic></sub>. And the subsequent masking calculation can be represented as follows:</p>
<disp-formula id="E13"><label>(13)</label><mml:math id="M21"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>A</mml:mi><mml:mi>S</mml:mi><mml:mi>K</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mi>S</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>g</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>S</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02297;</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mi>S</mml:mi></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>=</mml:mo><mml:mi mathvariant='script'>S</mml:mi><mml:mi mathvariant='script'>N</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>S</mml:mi><mml:mi>U</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mi>C</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>&#x02297;</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>S</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02297;</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mi>S</mml:mi></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x02297; denotes the Hadamard product, <italic>g</italic>(&#x000B7;) is used to compute the attention map, and <italic>SUM</italic><sub><italic>C</italic></sub> is used to calculate the sum of each column. The outputs of <italic>g</italic>(&#x000B7;) and <italic>SUM</italic><sub><italic>C</italic></sub> are row vectors of dimension <italic>D</italic>. Additionally, the Hadamard product between pulse tensors is equivalent to mask computation.</p></sec></sec>
<sec id="s4">
<title>4 Experimental evaluation</title>
<sec>
<title>4.1 Dataset</title>
<p>We evaluate our models on three public datasets, all generated by recording actions in real scenes. SL-Animals-DVS (Vasudevan et al., <xref ref-type="bibr" rid="B56">2022</xref>) and DVS128 Gesture (Amir et al., <xref ref-type="bibr" rid="B3">2017</xref>) were captured by a 128 &#x000D7; 128 pixel DVS128 camera, while DailyAction-DVS (Liu et al., <xref ref-type="bibr" rid="B33">2021</xref>) was captured by a DAVIS346 camera with a spatial resolution of 346 &#x000D7; 260. Furthermore, we also tested our models using the N-LSA64 dataset which is transformed from LSA64 (Ronchetti et al., <xref ref-type="bibr" rid="B43">2023</xref>) dataset using v2e (Hu et al., <xref ref-type="bibr" rid="B23">2021</xref>) method.</p>
<sec>
<title>4.1.1 SL-Animals-DVS</title>
<p>In the SL-Animals-DVS dataset 59 individuals were recorded separately, and each individual performed 19 signs in sequence. Due to the fact that the recording is conducted in 4 sessions at different locations under different lighting conditions, it can be further divided into SL-Animals-DVS-4sets, which includes four shooting environments, and SL-Animals-DVS-3sets, which includes three shooting environments.</p></sec>
<sec>
<title>4.1.2 DVS128 gesture</title>
<p>The DVS128 Gesture dataset comprises 1,342 recordings of 29 subjects performing 11 different actions (including one rejected class with random gestures) under three different lighting conditions.</p></sec>
<sec>
<title>4.1.3 DailyAction-DVS</title>
<p>The DailyAction-DVS dataset comprises 1,440 recordings of 15 subjects performing 12 different actions, including <italic>bend</italic>, <italic>climb</italic>, <italic>falldown</italic>, <italic>getup</italic>, <italic>jump</italic>, <italic>liedown</italic>, <italic>carrybox</italic>, <italic>run</italic>, <italic>sitdown</italic>, <italic>standup</italic>, <italic>walk</italic> and <italic>pickup</italic>. The actions were captured under two lighting conditions: <italic>naturallight</italic> and <italic>LEDlight</italic>.</p></sec>
<sec>
<title>4.1.4 N-LSA64</title>
<p>The N-LSA64 contains 3,200 DVS videos in which 10 non-expert subjects performed five repetitions of 64 different types of sign language. The symbols were selected from the most commonly used symbols in the LSA lexicon, including verbs and nouns. Depending on the number of hands performing the sign language, we further divide the data into N-LSA64-Right, which includes only right-hand movements, and N-LSA64-Both, which includes movements involving both hands.</p>
<p>We utilize a frame-based representation to preprocess an event stream (Fang et al., <xref ref-type="bibr" rid="B13">2021b</xref>; Yao et al., <xref ref-type="bibr" rid="B71">2021</xref>), transforming it into a sequence of event frames. Suppose the interval between two frames (i.e., temporal resolution) is <italic>dt</italic> and there are <italic>T</italic> frames (i.e., timesteps), the total length of the input event stream is <italic>t</italic><sub><italic>total</italic></sub> &#x0003D; <italic>dt</italic>&#x000D7;<italic>T</italic> milliseconds. After processing these frames with the proposed model, we can obtain a prediction.</p></sec></sec>
<sec>
<title>4.2 Implementation details</title>
<p>We set the number of parallel spike-driven transformer blocks to <italic>L</italic> = 2 in Spike-HAR and Spike-HAR&#x0002B;&#x0002B;. In the DVS128 Gesture dataset, the sample length, time step, and learning rate are set to 6,000 ms, 20 and 1 &#x000D7; 10<sup>&#x02212;3</sup>, respectively. In the SL-Animals-DVS and N-LSA64 datasets, the sample length, time step, and learning rate are set to 500 ms, 10 and 1 &#x000D7; 10<sup>&#x02212;4</sup>, respectively. In the DailyAction-DVS dataset, the sample length, time step, and learning rate are set to 1,200 ms, 10 and 1 &#x000D7; 10<sup>&#x02212;3</sup>, respectively. For the training and evaluation of frame-based methods, if the number of frames contained in each event frame sequence is larger than the timesteps <italic>T</italic>, we linearly sample <italic>T</italic> of them. Otherwise, we pad them to the length of <italic>T</italic> with the zero-padding operation. Spike-HAR and Spike-HAR&#x0002B;&#x0002B; are optimized with the AdamW (Loshchilov and Hutter, <xref ref-type="bibr" rid="B35">2017</xref>) optimizer on a single NVIDIA GeForce RTX 3090. We set the batch size to 32 and trained for 240 epochs using the one cycle learning rate policy (Smith and Topin, <xref ref-type="bibr" rid="B51">2018</xref>). As for the data augmentation, we use spatial and temporal random crop and repeat each sample within the training batch twice with different augmentations. In addition, for the N-LSA64 dataset, we divided the data into training, validation, and test sets in the ratio of 6:2:2, and evaluated the classification accuracy on the test set.</p></sec>
<sec>
<title>4.3 Comparison to the state-of-the-art models</title>
<p>We compare the proposed Spike-HAR and Spike-HAR&#x0002B;&#x0002B; with several relevant action recognition methods, including SNNs and ANNs. The results on four datasets are shown in <xref ref-type="table" rid="T1">Tables 1</xref>&#x02013;<xref ref-type="table" rid="T4">4</xref>, respectively. We find that our proposed models outperform existing action recognition methods, indicating that our proposed models have a stronger ability to extract action information from event data. Specifically, on the SL-Animals-DVS dataset, we compare our models with existing ANN models, SNN models and a hybrid neural network that includes both ANN and SNN components. Additionally, we replace the backbone network in EVT (Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>) with the Spike-Driven Transformer block (Yao et al., <xref ref-type="bibr" rid="B74">2024b</xref>) to obtain Spike-EVT and conduct model training for comparative analysis. Experimental results on SL-Animals-DVS are given in <xref ref-type="table" rid="T1">Table 1</xref>, from which we can see that the accuracy of Spike-HAR&#x0002B;&#x0002B; is 3.81 and 5.37% higher than that of EVT (Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>) on the dataset SL-Animals-DVS-4sets and SL-Animals-DVS-3sets, respectively. Compared to the SNN method EventRPG &#x0002B; SEW ResNet18 (Sun et al., <xref ref-type="bibr" rid="B54">2024</xref>), Spike-HAR&#x0002B;&#x0002B; improves the accuracy by 0.35% on the dataset SL-Animals-DVS-4sets. On the SL-Animals-DVS-3sets, EventRPG &#x0002B; SEW ResNet18 achieves a higher classification accuracy of 93.30% by using complex data augmentation strategies. In contrast, Spike-HAR&#x0002B;&#x0002B; reaches a similar accuracy of 92.82% with simple data augmentation (Section 4.2) and a more lightweight backbone (Section 4.5, Spike-HAR&#x0002B;&#x0002B; vs. SEW ResNet18). 
On the N-LSA64-Both and N-LSA64-Right datasets, for comparison with existing methods, we adopt the same sampling and training strategies to train the SOTA ANN model EVT (Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>), the baseline SNN model STBP (Wu et al., <xref ref-type="bibr" rid="B68">2018</xref>), and Spike-EVT, which is constructed by replacing the EVT backbone with a spiking transformer (Yao et al., <xref ref-type="bibr" rid="B74">2024b</xref>). The test results, presented in <xref ref-type="table" rid="T2">Table 2</xref>, demonstrate that Spike-HAR&#x0002B;&#x0002B; increases accuracy by 1.72% compared to EVT on the N-LSA64-Both dataset and by 5.71% compared to Spike-EVT on the N-LSA64-Right dataset. Furthermore, compared to the other models, Spike-HAR and Spike-HAR&#x0002B;&#x0002B; utilize the shortest sample length of 500 ms. For the DVS128 Gesture, as can be seen in <xref ref-type="table" rid="T3">Table 3</xref>, Spike-HAR and Spike-HAR&#x0002B;&#x0002B; achieve classification accuracies of 98.26 and 97.92%, respectively, outperforming other ANN and SNN methods. Finally, as shown in <xref ref-type="table" rid="T4">Table 4</xref>, we compared our results on DailyAction-DVS with state-of-the-art SNN models. Spike-HAR&#x0002B;&#x0002B; achieved the best classification performance, reaching 98.47%, using the sample length of just 1,200 ms.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Classification accuracy in the SL-Animals-DVS dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>Time step</bold></th>
<th valign="top" align="left"><bold>Sample length</bold></th>
<th valign="top" align="left" colspan="2"><bold>SL-Animals-DVS</bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#919498;color:#ffffff">
<td/>
<td/>
<td/>
<td/>
<td valign="top" align="left"><bold>4 sets</bold></td>
<td valign="top" align="left"><bold>3 sets</bold></td>
</tr> <tr>
<td valign="top" align="left">TORE &#x0002B; GoogLeNet (Baldwin et al., <xref ref-type="bibr" rid="B4">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.8510</td>
<td valign="top" align="left">\</td>
</tr> <tr>
<td valign="top" align="left">TORE &#x0002B; ResNet18 (Baldwin et al., <xref ref-type="bibr" rid="B4">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.7690</td>
<td valign="top" align="left">\</td>
</tr> <tr>
<td valign="top" align="left">VoxelGrid &#x0002B; ResNet18 (Zhu et al., <xref ref-type="bibr" rid="B80">2019</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.8902</td>
<td valign="top" align="left">\</td>
</tr> <tr>
<td valign="top" align="left">SITS &#x0002B; ResNet18 (Manderscheid et al., <xref ref-type="bibr" rid="B37">2019</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.7847</td>
<td valign="top" align="left">\</td>
</tr> <tr>
<td valign="top" align="left">VK-SITS &#x0002B; ResNet18 (Acin et al., <xref ref-type="bibr" rid="B2">2023</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.7926</td>
<td valign="top" align="left">\</td>
</tr> <tr>
<td valign="top" align="left">EVT (Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">504 ms</td>
<td valign="top" align="left">0.8812</td>
<td valign="top" align="left">0.8745</td>
</tr> <tr>
<td valign="top" align="left">SCTFA &#x0002B; 7-Layer Spiking CNN (Cai et al., <xref ref-type="bibr" rid="B6">2024</xref>)</td>
<td valign="top" align="left">Hybrid</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9004</td>
<td valign="top" align="left">\</td>
</tr> <tr>
<td valign="top" align="left">SLAYER (Shrestha and Orchard, <xref ref-type="bibr" rid="B50">2018</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">300</td>
<td valign="top" align="left">1,500 ms</td>
<td valign="top" align="left">0.5430</td>
<td valign="top" align="left">0.6141</td>
</tr> <tr>
<td valign="top" align="left">STBP (Wu et al., <xref ref-type="bibr" rid="B68">2018</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">50</td>
<td valign="top" align="left">1,500 ms</td>
<td valign="top" align="left">0.6497</td>
<td valign="top" align="left">0.7147</td>
</tr> <tr>
<td valign="top" align="left">DECOLLE (Kaiser et al., <xref ref-type="bibr" rid="B26">2020</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">500</td>
<td valign="top" align="left">500 ms</td>
<td valign="top" align="left">0.6219</td>
<td valign="top" align="left">0.6219</td>
</tr> <tr>
<td valign="top" align="left">SEW Resnet18 (Fang et al., <xref ref-type="bibr" rid="B12">2021a</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">16</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.8542</td>
<td valign="top" align="left">0.8909</td>
</tr> <tr>
<td valign="top" align="left">EventDrop &#x0002B; SEW ResNet18 (Gu et al., <xref ref-type="bibr" rid="B17">2021</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.8633</td>
<td valign="top" align="left">0.8899</td>
</tr> <tr>
<td valign="top" align="left">NDA &#x0002B; SEW ResNet18 (Li et al., <xref ref-type="bibr" rid="B28">2022</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.8777</td>
<td valign="top" align="left">0.8955</td>
</tr> <tr>
<td valign="top" align="left">EventRPG &#x0002B; SEW ResNet18 (Sun et al., <xref ref-type="bibr" rid="B54">2024</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left"><bold>0.9159</bold></td>
<td valign="top" align="left"><inline-formula><mml:math id="M26"><mml:mrow><mml:mstyle mathcolor="#ee1c23"><mml:mtext>0.9330</mml:mtext></mml:mstyle></mml:mrow></mml:math></inline-formula></td>
</tr> <tr>
<td valign="top" align="left">Spike-driven EVT (Yao et al., <xref ref-type="bibr" rid="B74">2024b</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">11</td>
<td valign="top" align="left">504 ms</td>
<td valign="top" align="left">0.7939</td>
<td valign="top" align="left">0.6667</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">500 ms</td>
<td valign="top" align="left">0.8947</td>
<td valign="top" align="left">0.9006</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR&#x0002B;&#x0002B; (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">500 ms</td>
<td valign="top" align="left"><inline-formula><mml:math id="M27"><mml:mrow><mml:mstyle mathcolor="#ee1c23"><mml:mtext>0.9193</mml:mtext></mml:mstyle></mml:mrow></mml:math></inline-formula></td>
<td valign="top" align="left"><bold>0.9282</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Red and bold indicate the best and second best performance.</p>
</table-wrap-foot>
</table-wrap><table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Classification accuracy in the N-LSA64 dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>Time step</bold></th>
<th valign="top" align="left"><bold>Sample length</bold></th>
<th valign="top" align="left" colspan="2"><bold>N-LSA64</bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#919498;color:#ffffff">
<td/>
<td/>
<td/>
<td/>
<td valign="top" align="left"><bold>Both</bold></td>
<td valign="top" align="left"><bold>Right</bold></td>
</tr> <tr>
<td valign="top" align="left">EVT (Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">504 ms</td>
<td valign="top" align="left">0.8406</td>
<td valign="top" align="left">0.8214</td>
</tr> <tr>
<td valign="top" align="left">STBP (Wu et al., <xref ref-type="bibr" rid="B68">2018</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">50</td>
<td valign="top" align="left">1,500 ms</td>
<td valign="top" align="left">0.5969</td>
<td valign="top" align="left">0.5786</td>
</tr> <tr>
<td valign="top" align="left">Spike-driven EVT (Yao et al., <xref ref-type="bibr" rid="B74">2024b</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">11</td>
<td valign="top" align="left">504 ms</td>
<td valign="top" align="left">0.7266</td>
<td valign="top" align="left">0.8262</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">500 ms</td>
<td valign="top" align="left"><bold>0.8469</bold></td>
<td valign="top" align="left"><bold>0.8690</bold></td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR&#x0002B;&#x0002B; (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">500 ms</td>
<td valign="top" align="left"><inline-formula><mml:math id="M28"><mml:mrow><mml:mstyle mathcolor="#ee1c23"><mml:mtext>0.8578</mml:mtext></mml:mstyle></mml:mrow></mml:math></inline-formula></td>
<td valign="top" align="left"><inline-formula><mml:math id="M29"><mml:mrow><mml:mstyle mathcolor="#ee1c23"><mml:mtext>0.8833</mml:mtext></mml:mstyle></mml:mrow></mml:math></inline-formula></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Red and bold indicate the best and second best performance.</p>
</table-wrap-foot>
</table-wrap><table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Classification accuracy in the DVS128 Gesture dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>Time step</bold></th>
<th valign="top" align="left"><bold>Sample length</bold></th>
<th valign="top" align="left"><bold>DVS128 Gesture</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">12 layers CNN (Amir et al., <xref ref-type="bibr" rid="B3">2017</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">120</td>
<td valign="top" align="left">120 ms</td>
<td valign="top" align="left">0.9260</td>
</tr> <tr>
<td valign="top" align="left">Identify &#x0002B; Resnet34 (He et al., <xref ref-type="bibr" rid="B20">2016</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9549</td>
</tr> <tr>
<td valign="top" align="left">NDA &#x0002B; Resnet34 (Li et al., <xref ref-type="bibr" rid="B28">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9722</td>
</tr> <tr>
<td valign="top" align="left">EventMix &#x0002B; Resnet34 (Shen et al., <xref ref-type="bibr" rid="B47">2023</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9180</td>
</tr> <tr>
<td valign="top" align="left">ShapeAug &#x0002B; Resnet34 (Bendig et al., <xref ref-type="bibr" rid="B5">2024</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9170</td>
</tr> <tr>
<td valign="top" align="left">EventDrop &#x0002B; Resnet34 (Gu et al., <xref ref-type="bibr" rid="B17">2021</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9618</td>
</tr> <tr>
<td valign="top" align="left">PLIF-SNN (Fang et al., <xref ref-type="bibr" rid="B13">2021b</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">6,000 ms</td>
<td valign="top" align="left">0.9760</td>
</tr> <tr>
<td valign="top" align="left">Res-SNN-18 (Yao et al., <xref ref-type="bibr" rid="B71">2021</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">16</td>
<td valign="top" align="left">6,000 ms</td>
<td valign="top" align="left">0.9790</td>
</tr> <tr>
<td valign="top" align="left">ASA-SNN (Yao et al., <xref ref-type="bibr" rid="B73">2023</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">6,000 ms</td>
<td valign="top" align="left">0.9770</td>
</tr> <tr>
<td valign="top" align="left">Identify &#x0002B; SEW Resnet18 (Fang et al., <xref ref-type="bibr" rid="B12">2021a</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9433</td>
</tr> <tr>
<td valign="top" align="left">Eventmix &#x0002B; SEW Resnet18 (Shen et al., <xref ref-type="bibr" rid="B47">2023</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9675</td>
</tr> <tr>
<td valign="top" align="left">EventRPG &#x0002B; SEW Resnet18 (Sun et al., <xref ref-type="bibr" rid="B54">2024</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9653</td>
</tr> <tr>
<td valign="top" align="left">Identify &#x0002B; CSNN (Xu et al., <xref ref-type="bibr" rid="B70">2018</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9375</td>
</tr> <tr>
<td valign="top" align="left">NDA &#x0002B; CSNN (Li et al., <xref ref-type="bibr" rid="B28">2022</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9583</td>
</tr> <tr>
<td valign="top" align="left">EventAugmentation &#x0002B; CSNN (Gu et al., <xref ref-type="bibr" rid="B16">2024</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9625</td>
</tr> <tr>
<td valign="top" align="left">EventDrop &#x0002B; CSNN (Gu et al., <xref ref-type="bibr" rid="B17">2021</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9444</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">6,000 ms</td>
<td valign="top" align="left"><inline-formula><mml:math id="M30"><mml:mrow><mml:mstyle mathcolor="#ee1c23"><mml:mtext>0.9826</mml:mtext></mml:mstyle></mml:mrow></mml:math></inline-formula></td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR&#x0002B;&#x0002B; (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">6,000 ms</td>
<td valign="top" align="left"><bold>0.9792</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Red and bold indicate the best and second best performance.</p>
</table-wrap-foot>
</table-wrap><table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Classification accuracy in the DailyAction-DVS dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>Time step</bold></th>
<th valign="top" align="left"><bold>Sample length</bold></th>
<th valign="top" align="left"><bold>DailyAction-DVS</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Gabor-Tempotron SNN (Xiao et al., <xref ref-type="bibr" rid="B69">2019</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.6830</td>
</tr> <tr>
<td valign="top" align="left">HMAX-SNN (Liu Q. et al., <xref ref-type="bibr" rid="B32">2020</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.7690</td>
</tr> <tr>
<td valign="top" align="left">Motion-SNN (Liu et al., <xref ref-type="bibr" rid="B33">2021</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9030</td>
</tr> <tr>
<td valign="top" align="left">PLIF-SNN (Fang et al., <xref ref-type="bibr" rid="B13">2021b</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">36</td>
<td valign="top" align="left">4,320 ms</td>
<td valign="top" align="left">0.9250</td>
</tr> <tr>
<td valign="top" align="left">ASA-SNN (Yao et al., <xref ref-type="bibr" rid="B73">2023</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">36</td>
<td valign="top" align="left">4,320 ms</td>
<td valign="top" align="left">0.9460</td>
</tr> <tr>
<td valign="top" align="left">EHTI &#x00026; MDTS-Tempotron SNN (Ding et al., <xref ref-type="bibr" rid="B9">2024</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">\</td>
<td valign="top" align="left">0.9608</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">1,200 ms</td>
<td valign="top" align="left"><bold>0.9826</bold></td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR&#x0002B;&#x0002B; (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">1,200 ms</td>
<td valign="top" align="left"><inline-formula><mml:math id="M31"><mml:mrow><mml:mstyle mathcolor="#ee1c23"><mml:mtext>0.9847</mml:mtext></mml:mstyle></mml:mrow></mml:math></inline-formula></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Red and bold indicate the best and second best performance.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.4 Ablation study</title>
<p>In this section, we analyze the impact of hyperparameters and the key components of Spike-HAR and Spike-HAR&#x0002B;&#x0002B;. Experiments are conducted on the SL-Animals-DVS-4sets dataset. With a fixed total sample length of 500 ms, different time steps are set to investigate the impact of the number of input event frames and transformer blocks on the model results. As can be seen in <xref ref-type="fig" rid="F4">Figure 4</xref>, the test accuracy of the model does not change significantly as the number of time steps and the number of MLP blocks increase; however, when the number of time steps exceeds 20 or the number of MLP blocks decreases to 1, the test accuracy drops significantly. Specifically, the highest accuracies of 89.47% for Spike-HAR and 91.93% for Spike-HAR&#x0002B;&#x0002B; are achieved by setting the time step to 10 and the number of blocks to 2. On the other hand, the accuracy decreases to 81.72% for Spike-HAR and 87.72% for Spike-HAR&#x0002B;&#x0002B; when the time step is set to 25, and setting the number of blocks to 1 results in an accuracy of 84.65% for Spike-HAR and 90.88% for Spike-HAR&#x0002B;&#x0002B;. In addition, the experimental results verify the parallel structure and the attention branch used in the proposed models. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, using both parallel transformers and the attention branch simultaneously yields the best accuracy in Spike-HAR and Spike-HAR&#x0002B;&#x0002B;.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p><bold>(a)</bold> Accuracy of Spike-HAR (green line) and Spike-HAR&#x0002B;&#x0002B; (blue line) at varying timesteps <italic>T</italic> (with 2 MLP blocks); <bold>(b)</bold> Accuracy of Spike-HAR (green line) and Spike-HAR&#x0002B;&#x0002B; (blue line) with different numbers of MLP blocks (with <italic>T</italic> &#x0003D; 10).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-18-1508297-g0004.tif"/>
</fig><table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Accuracy of Spike-HAR and Spike-HAR&#x0002B;&#x0002B; for different architectures on SL-Animals-DVS-4sets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Models</bold></th>
<th valign="top" align="left"><bold>Attention branch</bold></th>
<th valign="top" align="left"><bold>Serial transformer block</bold></th>
<th valign="top" align="left"><bold>Parallel transformer block</bold></th>
<th valign="top" align="left"><bold>Accuracy</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Spike-HAR</td>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td/>
<td valign="top" align="left">0.8640</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left">&#x02713;</td>
<td/>
<td valign="top" align="left">0.8465</td>
</tr>
 <tr>
<td/>
<td/>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left">0.8421</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left">0.8947</td>
</tr> <tr>
<td valign="top" align="left">Spike-HAR&#x0002B;&#x0002B;</td>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td/>
<td valign="top" align="left">0.8640</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left">&#x02713;</td>
<td/>
<td valign="top" align="left">0.9088</td>
</tr>
 <tr>
<td/>
<td/>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left">0.8421</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left">0.9193</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>4.5 Energy consumption analysis</title>
<p>We use the SL-Animals-DVS dataset to estimate the energy required for the proposed models to classify a DVS sign language video. We first determine the number of operations [SOPs (Zhou et al., <xref ref-type="bibr" rid="B79">2022</xref>) for the SNN module] needed to complete this task:</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mo class="qopname">FLOPs</mml:mo></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>2</mml:mn><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E15"><label>(15)</label><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mo class="qopname">SOPs</mml:mo></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>2</mml:mn><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mi>r</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mo class="qopname">FLOPs</mml:mo></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>2</mml:mn><mml:mi>D</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>k</italic><sub><italic>n</italic></sub> is the kernel size, (<italic>t</italic><sub><italic>n</italic></sub>, <italic>h</italic><sub><italic>n</italic></sub>, <italic>w</italic><sub><italic>n</italic></sub>) is the output feature map size, <italic>c</italic><sub><italic>n</italic>&#x02212;1</sub> and <italic>c</italic><sub><italic>n</italic></sub> are the input and output channel numbers, respectively. <italic>fr</italic> and <italic>T</italic><sub><italic>s</italic></sub> denote the spike fire rate and timesteps, respectively. The <italic>fr</italic> is defined as the proportion of non-zero elements within the spike tensor. Practically, we set <italic>T</italic><sub><italic>s</italic></sub> to 10. Once SOPs for the SNN module are determined, we can further obtain the final energy consumption <italic>E</italic> by multiplying the SOPs with the platform&#x00027;s energy:</p>
<disp-formula id="E16"><label>(16)</label><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mo class="qopname">SOPs</mml:mo></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mo class="qopname">AC</mml:mo></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mo class="qopname">SOPs</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>We use the same energy efficiency calculation scheme proposed by Hu Y. et al. (<xref ref-type="bibr" rid="B24">2023</xref>). The energy consumption is 12.5 pJ for each floating-point operation (FLOP) and is 77 fJ for each synaptic operation (SOP). As shown in <xref ref-type="table" rid="T6">Table 6</xref>, the Spike-HAR processes DVS frame data with a spatial size of 96 &#x000D7; 96 and a time step of 10 with only 0.03 mJ of power consumption. This represents a 99.27% energy reduction compared to EVT and is substantially lower than that of other baseline models. Furthermore, although Spike-HAR&#x0002B;&#x0002B; has a higher power consumption compared to Spike-HAR (0.06 vs. 0.03 mJ), it is still lower than that of other models and achieves higher performance than Spike-HAR on the SL-Animals-DVS, N-LSA64, and DailyAction-DVS datasets, with comparable accuracy on the DVS128 Gesture dataset.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Computational complexity comparisons of SLR methods.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>&#x00023;Params.</bold></th>
<th valign="top" align="left"><bold>FLOPs/SOPs</bold></th>
<th valign="top" align="left"><bold>Power/mJ</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">TORE &#x0002B; ResNet18 (Baldwin et al., <xref ref-type="bibr" rid="B4">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">11.69 M</td>
<td valign="top" align="left">3.66 G</td>
<td valign="top" align="left">45.75</td>
</tr> <tr>
<td valign="top" align="left">TORE &#x0002B; GoogLeNet (Baldwin et al., <xref ref-type="bibr" rid="B4">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">8.46 M</td>
<td valign="top" align="left">2.88 G</td>
<td valign="top" align="left">36.00</td>
</tr> <tr>
<td valign="top" align="left">EVT (Sabater et al., <xref ref-type="bibr" rid="B44">2022</xref>)</td>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">0.50 M</td>
<td valign="top" align="left">0.33 G</td>
<td valign="top" align="left">4.13</td>
</tr> <tr>
<td valign="top" align="left">Spike-driven EVT (Yao et al., <xref ref-type="bibr" rid="B74">2024b</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">66.34 M</td>
<td valign="top" align="left">6.77 G</td>
<td valign="top" align="left">0.52</td>
</tr> <tr>
<td valign="top" align="left">SEW Resnet18 (Fang et al., <xref ref-type="bibr" rid="B12">2021a</xref>)</td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">2.92 M</td>
<td valign="top" align="left">1.41 G</td>
<td valign="top" align="left">0.11</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">0.70 M</td>
<td valign="top" align="left">0.44 G</td>
<td valign="top" align="left">0.03</td>
</tr> <tr>
<td valign="top" align="left"><bold>Spike-HAR&#x0002B;&#x0002B; (Ours)</bold></td>
<td valign="top" align="left">SNN</td>
<td valign="top" align="left">1.80 M</td>
<td valign="top" align="left">0.74 G</td>
<td valign="top" align="left">0.06</td>
</tr></tbody>
</table>
</table-wrap>
</sec></sec>
<sec sec-type="conclusions" id="s5">
<title>5 Conclusion</title>
<p>In this paper, we propose an energy-efficient and lightweight Spike-HAR family for event-based human action recognition, to adaptively emphasize local spatial features as well as temporal features. Spike-HAR and Spike-HAR&#x0002B;&#x0002B; surpass existing methods in accuracy on the SL-Animals-DVS, N-LSA64, DVS128 Gesture, and DailyAction-DVS datasets. Furthermore, Spike-HAR and Spike-HAR&#x0002B;&#x0002B; require only 0.03 and 0.06 mJ to recognize a single action event stream, reducing power consumption by 99.27 and 98.55% compared to EVT, respectively. It demonstrates the applicability of spiking transformers for human action recognition and their potential application in human-machine interaction and edge HAR devices. In the future, it is promising to develop a more complex large-scale event-based HAR benchmark to further evaluate the performance of the Spike-HAR family in practical applications.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: SL-Animals-DVS (<ext-link ext-link-type="uri" xlink:href="http://www2.imse-cnm.csic.es/neuromorphs/index.php/SL-ANIMALS-DVS-Database">http://www2.imse-cnm.csic.es/neuromorphs/index.php/SL-ANIMALS-DVS-Database</ext-link>); LSA64 (<ext-link ext-link-type="uri" xlink:href="https://facundoq.github.io/datasets/lsa64/">https://facundoq.github.io/datasets/lsa64/</ext-link>); DVS128 Gesture (<ext-link ext-link-type="uri" xlink:href="https://ibm.ent.box.com/s/3hiq58ww1pbbjrinh367ykfdf60xsfm8/folder/50167556794">https://ibm.ent.box.com/s/3hiq58ww1pbbjrinh367ykfdf60xsfm8/folder/50167556794</ext-link>); DailyAction-DVS (<ext-link ext-link-type="uri" xlink:href="https://github.com/qianhuiliu/SNN-action-recognition">https://github.com/qianhuiliu/SNN-action-recognition</ext-link>).</p>
</sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>Written informed consent was not obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article because the photographs appearing in the manuscript are sourced from publicly available datasets.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>XL: Data curation, Investigation, Methodology, Validation, Visualization, Writing &#x02013; original draft. ML: Data curation, Investigation, Methodology, Validation, Visualization, Writing &#x02013; original draft. HC: Conceptualization, Funding acquisition, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Natural Science Foundation of China under Grants 92164110 and 62334014.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that Generative AI was used in the creation of this manuscript. During the preparation of this work the author(s) used GPT-4 in order to polish the content. After using this tool/service, the author(s) reviewed and edited the content as needed and take full responsibility for the content of the publication.</p></sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Achiam</surname> <given-names>J.</given-names></name> <name><surname>Adler</surname> <given-names>S.</given-names></name> <name><surname>Agarwal</surname> <given-names>S.</given-names></name> <name><surname>Ahmad</surname> <given-names>L.</given-names></name> <name><surname>Akkaya</surname> <given-names>I.</given-names></name> <name><surname>Aleman</surname> <given-names>F. L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>GPT-4 technical report</article-title>. <source>arXiv</source> [Preprint]. arXiv:2303.08774. <pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Acin</surname> <given-names>L.</given-names></name> <name><surname>Jacob</surname> <given-names>P.</given-names></name> <name><surname>Simon-Chane</surname> <given-names>C.</given-names></name> <name><surname>Histace</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;VK-sits: a robust time-surface for fast event-based recognition,&#x0201D;</article-title> in <source>2023 Twelfth International Conference on Image Processing Theory, Tools and Applications (IPTA)</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/IPTA59101.2023.10320049</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Amir</surname> <given-names>A.</given-names></name> <name><surname>Taba</surname> <given-names>B.</given-names></name> <name><surname>Berg</surname> <given-names>D.</given-names></name> <name><surname>Melano</surname> <given-names>T.</given-names></name> <name><surname>McKinstry</surname> <given-names>J.</given-names></name> <name><surname>Di Nolfo</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;A low power, fully event-based gesture recognition system,&#x0201D;</article-title> in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source> (<publisher-loc>Honolulu, HI</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>7243</fpage>&#x02013;<lpage>7252</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2017.781</pub-id><pub-id pub-id-type="pmid">32903824</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Baldwin</surname> <given-names>R. W.</given-names></name> <name><surname>Liu</surname> <given-names>R.</given-names></name> <name><surname>Almatrafi</surname> <given-names>M.</given-names></name> <name><surname>Asari</surname> <given-names>V.</given-names></name> <name><surname>Hirakawa</surname> <given-names>K.</given-names></name></person-group> (<year>2022</year>). <article-title>Time-ordered recent event (TORE) volumes for event cameras</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>45</volume>, <fpage>2519</fpage>&#x02013;<lpage>2532</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3172212</pub-id><pub-id pub-id-type="pmid">35503820</pub-id></citation></ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bendig</surname> <given-names>K.</given-names></name> <name><surname>Schuster</surname> <given-names>R.</given-names></name> <name><surname>Stricker</surname> <given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Shapeaug: occlusion augmentation for event camera data</article-title>. <source>arXiv</source> [Preprint]. arXiv:2401.02274. <pub-id pub-id-type="doi">10.48550/arXiv.2401.02274</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>W.</given-names></name> <name><surname>Sun</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>R.</given-names></name> <name><surname>Cui</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Xia</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>A spatial-channel-temporal-fused attention for spiking neural networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>35</volume>, <fpage>14315</fpage>&#x02013;<lpage>14329</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2023.3278265</pub-id><pub-id pub-id-type="pmid">37256807</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cannici</surname> <given-names>M.</given-names></name> <name><surname>Ciccone</surname> <given-names>M.</given-names></name> <name><surname>Romanoni</surname> <given-names>A.</given-names></name> <name><surname>Matteucci</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;A differentiable recurrent surface for asynchronous event-based data,&#x0201D;</article-title> in <source>Computer Vision &#x02013; ECCV 2020: 16th European Conference, Glasgow, UK, August 23&#x02013;28, 2020, Proceedings, Part XX</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>), <fpage>136</fpage>&#x02013;<lpage>152</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58565-5_9</pub-id></citation>
</ref>
<ref id="B8">
<citation citation-type="web"><person-group person-group-type="author"><collab>Intel Corporation</collab></person-group> (<year>2023</year>). <source>Intel Stratix 10 TX device overview</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/hb/stratix-10/s10_tx_overview.pdf">https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/hb/stratix-10/s10_tx_overview.pdf</ext-link> (accessed December 10, 2023).</citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>H.</given-names></name> <name><surname>Jiang</surname> <given-names>J.</given-names></name> <name><surname>Yan</surname> <given-names>R.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;A time-surface enhancement model for event-based spatiotemporal feature extraction,&#x0201D;</article-title> in <source>2024 International Joint Conference on Neural Networks (IJCNN)</source> (<publisher-loc>Yokohama</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1109/IJCNN60899.2024.10650047</pub-id><pub-id pub-id-type="pmid">22782131</pub-id></citation></ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>M.</given-names></name> <name><surname>Ding</surname> <given-names>Y.</given-names></name> <name><surname>Wei</surname> <given-names>L.</given-names></name> <name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Individual surveillance around parked aircraft at nighttime: thermal infrared vision-based human action recognition</article-title>. <source>IEEE Trans. Syst. Man Cybern. Syst</source>. <volume>53</volume>, <fpage>1084</fpage>&#x02013;<lpage>1094</lpage>. <pub-id pub-id-type="doi">10.1109/TSMC.2022.3192017</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Ding</surname> <given-names>J.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name> <name><surname>Chen</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Spikingjelly: an open-source machine learning infrastructure platform for spike-based intelligence</article-title>. <source>Sci. Adv</source>. <volume>9</volume>:<fpage>eadi1480</fpage>. <pub-id pub-id-type="doi">10.1126/sciadv.adi1480</pub-id><pub-id pub-id-type="pmid">37801497</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2021a</year>). <article-title>Deep residual learning in spiking neural networks</article-title>. <source>Adv. Neural Inf. Process. Syst</source>. <volume>34</volume>, <fpage>21056</fpage>&#x02013;<lpage>21069</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2102.04159</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2021b</year>). <article-title>&#x0201C;Incorporating learnable membrane time constant to enhance learning of spiking neural networks,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF international conference on computer vision</source> (<publisher-loc>Montreal, QC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2661</fpage>&#x02013;<lpage>2671</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00266</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Lu</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Ma</surname> <given-names>N.</given-names></name> <name><surname>Du</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Action recognition and benchmark using event cameras</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>45</volume>, <fpage>14081</fpage>&#x02013;<lpage>14097</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2023.3300741</pub-id><pub-id pub-id-type="pmid">37527291</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ghosh</surname> <given-names>R.</given-names></name> <name><surname>Gupta</surname> <given-names>A.</given-names></name> <name><surname>Nakagawa</surname> <given-names>A.</given-names></name> <name><surname>Soares</surname> <given-names>A.</given-names></name> <name><surname>Thakor</surname> <given-names>N.</given-names></name></person-group> (<year>2019</year>). <article-title>Spatiotemporal filtering for event-based action recognition</article-title>. <source>arXiv</source> [Preprint]. arXiv:1903.07067. <pub-id pub-id-type="doi">10.48550/arXiv.1903.07067</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>F.</given-names></name> <name><surname>Dou</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Long</surname> <given-names>X.</given-names></name> <name><surname>Guo</surname> <given-names>S.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Eventaugment: learning augmentation policies from asynchronous event-based data</article-title>. <source>IEEE Trans. Cogn. Dev. Syst</source>. <volume>16</volume>, <fpage>1521</fpage>&#x02013;<lpage>1532</lpage>. <pub-id pub-id-type="doi">10.1109/TCDS.2024.3380907</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>F.</given-names></name> <name><surname>Sng</surname> <given-names>W.</given-names></name> <name><surname>Hu</surname> <given-names>X.</given-names></name> <name><surname>Yu</surname> <given-names>F.</given-names></name></person-group> (<year>2021</year>). <article-title>Eventdrop: data augmentation for event-based learning</article-title>. <source>arXiv</source> [Preprint]. arXiv:2106.05836. <pub-id pub-id-type="doi">10.48550/arXiv.2106.05836</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>L.</given-names></name> <name><surname>Gao</surname> <given-names>Z.</given-names></name> <name><surname>Qu</surname> <given-names>J.</given-names></name> <name><surname>Zheng</surname> <given-names>S.</given-names></name> <name><surname>Jiang</surname> <given-names>R.</given-names></name> <name><surname>Lu</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Transformer-based spiking neural networks for multimodal audio-visual classification</article-title>. <source>IEEE Trans. Cogn. Dev. Syst</source>. <volume>16</volume>, <fpage>1077</fpage>&#x02013;<lpage>1086</lpage>. <pub-id pub-id-type="doi">10.1109/TCDS.2023.3327081</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>K.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Guo</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A survey on vision transformer</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>45</volume>, <fpage>87</fpage>&#x02013;<lpage>110</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3152247</pub-id><pub-id pub-id-type="pmid">35180075</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Deep residual learning for image recognition,&#x0201D;</article-title> in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>770</fpage>&#x02013;<lpage>778</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>L.</given-names></name> <name><surname>Gao</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Feng</surname> <given-names>W.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Continuous sign language recognition with correlation network,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2529</fpage>&#x02013;<lpage>2539</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00249</pub-id><pub-id pub-id-type="pmid">37998082</pub-id></citation></ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name></person-group> (<year>2024</year>). <article-title>Advancing spiking neural networks toward deep residual learning</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <fpage>1</fpage>&#x02013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2024.3355393</pub-id><pub-id pub-id-type="pmid">38329859</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>S.-C.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;v2e: from video frames to realistic DVS events,&#x0201D;</article-title> in <source>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1312</fpage>&#x02013;<lpage>1321</lpage>. <pub-id pub-id-type="doi">10.1109/CVPRW53098.2021.00144</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Spiking deep residual networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>34</volume>, <fpage>5200</fpage>&#x02013;<lpage>5205</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3119238</pub-id><pub-id pub-id-type="pmid">34723807</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Innocenti</surname> <given-names>S. U.</given-names></name> <name><surname>Becattini</surname> <given-names>F.</given-names></name> <name><surname>Pernici</surname> <given-names>F.</given-names></name> <name><surname>Del Bimbo</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Temporal binary representation for event-based action recognition,&#x0201D;</article-title> in <source>2020 25th International Conference on Pattern Recognition (ICPR)</source> (<publisher-loc>Milan</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10426</fpage>&#x02013;<lpage>10432</lpage>. <pub-id pub-id-type="doi">10.1109/ICPR48806.2021.9412991</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kaiser</surname> <given-names>J.</given-names></name> <name><surname>Mostafa</surname> <given-names>H.</given-names></name> <name><surname>Neftci</surname> <given-names>E.</given-names></name></person-group> (<year>2020</year>). <article-title>Synaptic plasticity dynamics for deep continuous local learning (decolle)</article-title>. <source>Front. Neurosci</source>. <volume>14</volume>:<fpage>515306</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2020.00424</pub-id><pub-id pub-id-type="pmid">32477050</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>K&#x00131;nd&#x00131;roglu</surname> <given-names>A.</given-names></name> <name><surname>&#x000D6;zdemir</surname> <given-names>O.</given-names></name> <name><surname>Akarun</surname> <given-names>L.</given-names></name></person-group> (<year>2022</year>). <article-title>Aligning accumulative representations for sign language recognition</article-title>. <source>Mach. Vis. Appl</source>. <volume>34</volume>, <fpage>1</fpage>&#x02013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1007/s00138-022-01367-x</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Kim</surname> <given-names>Y.</given-names></name> <name><surname>Park</surname> <given-names>H.</given-names></name> <name><surname>Geller</surname> <given-names>T.</given-names></name> <name><surname>Panda</surname> <given-names>P.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Neuromorphic data augmentation for training spiking neural networks,&#x0201D;</article-title> in <source>European Conference on Computer Vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>631</fpage>&#x02013;<lpage>649</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-20071-7_37</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>W.</given-names></name> <name><surname>Sun</surname> <given-names>M.-T.</given-names></name> <name><surname>Poovendran</surname> <given-names>R.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name></person-group> (<year>2008</year>). <article-title>&#x0201C;Human activity recognition for video surveillance,&#x0201D;</article-title> in <source>2008 IEEE international symposium on circuits and systems (ISCAS)</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2737</fpage>&#x02013;<lpage>2740</lpage>. <pub-id pub-id-type="doi">10.1109/ISCAS.2008.4542023</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>K.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Spike-SLR: an energy-efficient parallel spiking transformer for event-based sign language recognition,&#x0201D;</article-title> in <source>BMVC 2024 &#x02013; 2024 The British Machine Vision Conference (BMVC)</source> (<publisher-loc>Glasgow</publisher-loc>).</citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Tang</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Qi</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Spiking-physformer: camera-based remote photoplethysmography with parallel spike-driven transformer</article-title>. <source>arXiv</source> [Preprint]. arXiv:2402.04798. <pub-id pub-id-type="doi">10.48550/arXiv.2402.04798</pub-id></citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Ruan</surname> <given-names>H.</given-names></name> <name><surname>Xing</surname> <given-names>D.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>Effective aer object classification using segmented probability-maximization learning in spiking neural networks</article-title>. <source>Proc. AAAI Conf. Artif. Intell</source>. <volume>34</volume>, <fpage>1308</fpage>&#x02013;<lpage>1315</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v34i02.5486</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Xing</surname> <given-names>D.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Ma</surname> <given-names>D.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Event-based action recognition using motion information and spiking neural networks,&#x0201D;</article-title> in <source>International Joint Conferences on Artificial Intelligence Organization, Virtual conference (IJCAI)</source>, <fpage>1743</fpage>&#x02013;<lpage>1749</lpage>. <pub-id pub-id-type="doi">10.24963/ijcai.2021/240</pub-id><pub-id pub-id-type="pmid">28381998</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Fromm</surname> <given-names>J.</given-names></name> <name><surname>Patel</surname> <given-names>S.</given-names></name> <name><surname>McDuff</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Multi-task temporal shift attention networks for on-device contactless vitals measurement,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems, Vol. 33</source>, eds. H. Larochelle, M. Ranzato, R. Hadsell, M. Balcan, and H. Lin (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates, Inc</publisher-name>), <fpage>19400</fpage>&#x02013;<lpage>19411</lpage>.</citation>
</ref>
<ref id="B35">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Loshchilov</surname> <given-names>I.</given-names></name> <name><surname>Hutter</surname> <given-names>F.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Decoupled weight decay regularization,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source> (<publisher-loc>Toulon</publisher-loc>).<pub-id pub-id-type="pmid">38536692</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Maass</surname> <given-names>W.</given-names></name></person-group> (<year>1997</year>). <article-title>Networks of spiking neurons: the third generation of neural network models</article-title>. <source>Neural Netw</source>. <volume>10</volume>, <fpage>1659</fpage>&#x02013;<lpage>1671</lpage>. <pub-id pub-id-type="doi">10.1016/S0893-6080(97)00011-7</pub-id></citation>
</ref>
<ref id="B37">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Manderscheid</surname> <given-names>J.</given-names></name> <name><surname>Sironi</surname> <given-names>A.</given-names></name> <name><surname>Bourdis</surname> <given-names>N.</given-names></name> <name><surname>Migliore</surname> <given-names>D.</given-names></name> <name><surname>Lepetit</surname> <given-names>V.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Speed invariant time surface for learning to detect corner points with event-based cameras,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10245</fpage>&#x02013;<lpage>10254</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2019.01049</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mercanoglu Sincan</surname> <given-names>O.</given-names></name> <name><surname>Keles</surname> <given-names>H. Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Using motion history images with 3D convolutional networks in isolated sign language recognition</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>18608</fpage>&#x02013;<lpage>18618</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3151362</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Murray</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <source>World Federation of the Deaf</source>. Available at: <ext-link ext-link-type="uri" xlink:href="http://wfdeaf.org/our-work/">http://wfdeaf.org/our-work/</ext-link> (accessed May 08, 2024).</citation>
</ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nasir</surname> <given-names>I. M.</given-names></name> <name><surname>Raza</surname> <given-names>M.</given-names></name> <name><surname>Shah</surname> <given-names>J. H.</given-names></name> <name><surname>Wang</surname> <given-names>S.-H.</given-names></name> <name><surname>Tariq</surname> <given-names>U.</given-names></name> <name><surname>Khan</surname> <given-names>M. A.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Harednet: a deep learning based architecture for autonomous video surveillance by recognizing human actions</article-title>. <source>Comput. Electr. Eng</source>. <volume>99</volume>:<fpage>107805</fpage>. <pub-id pub-id-type="doi">10.1016/j.compeleceng.2022.107805</pub-id></citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Prati</surname> <given-names>A.</given-names></name> <name><surname>Shan</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>K. I.-K.</given-names></name></person-group> (<year>2019</year>). <article-title>Sensors, vision and networks: from video surveillance to activity recognition and health monitoring</article-title>. <source>J. Ambient Intell. Smart Environ</source>. <volume>11</volume>, <fpage>5</fpage>&#x02013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.3233/AIS-180510</pub-id></citation>
</ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qiao</surname> <given-names>N.</given-names></name> <name><surname>Mostafa</surname> <given-names>H.</given-names></name> <name><surname>Corradi</surname> <given-names>F.</given-names></name> <name><surname>Osswald</surname> <given-names>M.</given-names></name> <name><surname>Stefanini</surname> <given-names>F.</given-names></name> <name><surname>Sumislawska</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>A reconfigurable on-line learning spiking neuromorphic processor comprising 256 neurons and 128k synapses</article-title>. <source>Front. Neurosci</source>. <volume>9</volume>:<fpage>141</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2015.00141</pub-id><pub-id pub-id-type="pmid">25972778</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ronchetti</surname> <given-names>F.</given-names></name> <name><surname>Quiroga</surname> <given-names>F. M.</given-names></name> <name><surname>Estrebou</surname> <given-names>C.</given-names></name> <name><surname>Lanzarini</surname> <given-names>L.</given-names></name> <name><surname>Rosete</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Lsa64: an argentinian sign language dataset</article-title>. <source>arXiv</source> [Preprint]. arXiv:2310.17429. <pub-id pub-id-type="doi">10.48550/arXiv.2310.17429</pub-id><pub-id pub-id-type="pmid">36315481</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sabater</surname> <given-names>A.</given-names></name> <name><surname>Montesano</surname> <given-names>L.</given-names></name> <name><surname>Murillo</surname> <given-names>A. C.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Event transformer. A sparse-aware solution for efficient event data processing,&#x0201D;</article-title> in <source>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</source> (<publisher-loc>New Orleans, LA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2676</fpage>&#x02013;<lpage>2685</lpage>. <pub-id pub-id-type="doi">10.1109/CVPRW56347.2022.00301</pub-id></citation>
</ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sabater</surname> <given-names>A.</given-names></name> <name><surname>Montesano</surname> <given-names>L.</given-names></name> <name><surname>Murillo</surname> <given-names>A. C.</given-names></name></person-group> (<year>2023</year>). <article-title>Event transformer&#x0002B;. A multi-purpose solution for efficient event data processing</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>45</volume>, <fpage>16013</fpage>&#x02013;<lpage>16020</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2023.3311336</pub-id><pub-id pub-id-type="pmid">37656643</pub-id></citation></ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sahoo</surname> <given-names>S. P.</given-names></name> <name><surname>Ari</surname> <given-names>S.</given-names></name> <name><surname>Mahapatra</surname> <given-names>K.</given-names></name> <name><surname>Mohanty</surname> <given-names>S. P.</given-names></name></person-group> (<year>2020</year>). <article-title>Har-depth: a novel framework for human action recognition using sequential learning and depth estimated history images</article-title>. <source>IEEE Trans. Emerg. Top. Comput. Intell</source>. <volume>5</volume>, <fpage>813</fpage>&#x02013;<lpage>825</lpage>. <pub-id pub-id-type="doi">10.1109/TETCI.2020.3014367</pub-id></citation>
</ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>G.</given-names></name> <name><surname>Zhao</surname> <given-names>D.</given-names></name> <name><surname>Zeng</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>Eventmix: an efficient data augmentation strategy for event-based learning</article-title>. <source>Inf. Sci</source>. <volume>644</volume>:<fpage>119170</fpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2023.119170</pub-id></citation>
</ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>X.</given-names></name> <name><surname>Zheng</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Stepnet: spatial-temporal part-aware network for isolated sign language recognition</article-title>. <source>ACM Trans. Multimedia Comput. Commun. Appl</source>. <volume>20</volume>:<fpage>226</fpage>. <pub-id pub-id-type="doi">10.1145/3656046</pub-id></citation>
</ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shi</surname> <given-names>Q.</given-names></name> <name><surname>Ye</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>Qisampling: an effective sampling strategy for event-based sign language recognition</article-title>. <source>IEEE Signal Process. Lett</source>. <volume>30</volume>, <fpage>768</fpage>&#x02013;<lpage>772</lpage>. <pub-id pub-id-type="doi">10.1109/LSP.2023.3289111</pub-id></citation>
</ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shrestha</surname> <given-names>S. B.</given-names></name> <name><surname>Orchard</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>Slayer: spike layer error reassignment in time</article-title>. <source>Adv. Neural Inf. Process. Syst</source>. <volume>31</volume>, <fpage>1412</fpage>&#x02013;<lpage>1421</lpage>.</citation>
</ref>
<ref id="B51">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Smith</surname> <given-names>L. N.</given-names></name> <name><surname>Topin</surname> <given-names>N.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Super-convergence: very fast training of neural networks using large learning rates,&#x0201D;</article-title> in <source>Defense &#x0002B; Commercial Sensing</source> (<publisher-loc>Baltimore, MD</publisher-loc>). <pub-id pub-id-type="doi">10.1117/12.2520589</pub-id></citation>
</ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stein</surname> <given-names>R.</given-names></name> <name><surname>Hodgkin</surname> <given-names>A. L.</given-names></name></person-group> (<year>1967</year>). <article-title>The frequency of nerve action potentials generated by applied currents</article-title>. <source>Proc. R. Soc. Lond. B. Biol. Sci</source>. <volume>167</volume>, <fpage>64</fpage>&#x02013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1098/rspb.1967.0013</pub-id><pub-id pub-id-type="pmid">4382591</pub-id></citation></ref>
<ref id="B53">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>L.</given-names></name> <name><surname>Yang</surname> <given-names>F.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Guo</surname> <given-names>C.</given-names></name> <name><surname>Tong</surname> <given-names>L.</given-names></name> <name><surname>Hu</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A survey of robot perception and control based on event camera</article-title>. <source>Acta Autom. Sin</source>. <volume>48</volume>, <fpage>1869</fpage>&#x02013;<lpage>1889</lpage>.</citation>
</ref>
<ref id="B54">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Ge</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Fang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Eventrpg: event data augmentation with relevance propagation guidance</article-title>. <source>arXiv</source> [Preprint]. arXiv:2403.09274. <pub-id pub-id-type="doi">10.48550/arXiv.2403.09274</pub-id></citation>
</ref>
<ref id="B55">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>Z.</given-names></name> <name><surname>Ke</surname> <given-names>Q.</given-names></name> <name><surname>Rahmani</surname> <given-names>H.</given-names></name> <name><surname>Bennamoun</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Human action recognition from various data modalities: a review</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>45</volume>, <fpage>3200</fpage>&#x02013;<lpage>3225</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3183112</pub-id><pub-id pub-id-type="pmid">35700242</pub-id></citation></ref>
<ref id="B56">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vasudevan</surname> <given-names>A.</given-names></name> <name><surname>Negri</surname> <given-names>P.</given-names></name> <name><surname>Di Ielsi</surname> <given-names>C.</given-names></name> <name><surname>Linares-Barranco</surname> <given-names>B.</given-names></name> <name><surname>Serrano-Gotarredona</surname> <given-names>T.</given-names></name></person-group> (<year>2022</year>). <article-title>Sl-animals-dvs: event-driven sign language animals dataset</article-title>. <source>Pattern Anal. Applic</source>. <volume>25</volume>, <fpage>505</fpage>&#x02013;<lpage>520</lpage>. <pub-id pub-id-type="doi">10.1007/s10044-021-01011-w</pub-id></citation>
</ref>
<ref id="B57">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N. M.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Attention is all you need,&#x0201D;</article-title> in <source>Neural Information Processing Systems</source> (<publisher-loc>Long Beach, CA</publisher-loc>).</citation>
</ref>
<ref id="B58">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>V&#x000E1;zquez-Enr&#x000ED;quez</surname> <given-names>M.</given-names></name> <name><surname>Alba-Castro</surname> <given-names>J. L.</given-names></name> <name><surname>Doc&#x000ED;o-Fern&#x000E1;ndez</surname> <given-names>L.</given-names></name> <name><surname>Rodr&#x000ED;guez-Banga</surname> <given-names>E.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Isolated sign language recognition with multi-scale spatial-temporal graph convolutional networks,&#x0201D;</article-title> in <source>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3457</fpage>&#x02013;<lpage>3466</lpage>. <pub-id pub-id-type="doi">10.1109/CVPRW53098.2021.00385</pub-id></citation>
</ref>
<ref id="B59">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>A comprehensive survey of RGB-based and skeleton-based human action recognition</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>53880</fpage>&#x02013;<lpage>53898</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3282311</pub-id></citation>
</ref>
<ref id="B60">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>F.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Yan</surname> <given-names>H.</given-names></name> <name><surname>Han</surname> <given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>TIM-SLR: a lightweight network for video isolated sign language recognition</article-title>. <source>Neural Comput. Appl</source>. <volume>35</volume>, <fpage>22265</fpage>&#x02013;<lpage>22280</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-023-08873-7</pub-id></citation>
</ref>
<ref id="B61">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Gao</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Tang</surname> <given-names>C.</given-names></name> <name><surname>Ogunbona</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Scene flow to action map: a new representation for rgb-d based action recognition with convolutional neural networks,&#x0201D;</article-title> in <source>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Honolulu, HI</publisher-loc>), <fpage>416</fpage>&#x02013;<lpage>425</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2017.52</pub-id></citation>
</ref>
<ref id="B62">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Luo</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Sun</surname> <given-names>L.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Recent advances in pedestrian navigation activity recognition: a review</article-title>. <source>IEEE Sens. J</source>. <volume>22</volume>, <fpage>7499</fpage>&#x02013;<lpage>7518</lpage>. <pub-id pub-id-type="doi">10.1109/JSEN.2022.3153610</pub-id></citation>
</ref>
<ref id="B63">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Yuan</surname> <given-names>J.</given-names></name> <name><surname>Lu</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Space-time event clouds for gesture recognition: from RGB cameras to event cameras,&#x0201D;</article-title> in <source>2019 IEEE Winter Conference on Applications of Computer Vision (WACV)</source> (<publisher-loc>Waikoloa, HI</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1826</fpage>&#x02013;<lpage>1835</lpage>. <pub-id pub-id-type="doi">10.1109/WACV.2019.00199</pub-id></citation>
</ref>
<ref id="B64">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Shao</surname> <given-names>P.</given-names></name> <name><surname>Jiang</surname> <given-names>B.</given-names></name> <name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Event stream based human action recognition: a high-definition benchmark dataset and algorithms</article-title>. <source>arXiv</source> [Preprint]. arXiv:2408.09764. <pub-id pub-id-type="doi">10.48550/arXiv.2408.09764</pub-id></citation>
</ref>
<ref id="B65">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Rong</surname> <given-names>Y.</given-names></name> <name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Jiang</surname> <given-names>B.</given-names></name> <name><surname>Tang</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Sstformer: bridging spiking neural network and memory support transformer for frame-event based recognition</article-title>. <source>arXiv</source> [Preprint]. arXiv:2308.04369. <pub-id pub-id-type="doi">10.48550/arXiv.2308.04369</pub-id></citation>
</ref>
<ref id="B66">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Shen</surname> <given-names>Y.</given-names></name> <name><surname>Du</surname> <given-names>B.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Cui</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Event-stream representation for human gaits identification using deep neural networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>44</volume>, <fpage>3436</fpage>&#x02013;<lpage>3449</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2021.3054886</pub-id><pub-id pub-id-type="pmid">33502972</pub-id></citation></ref>
<ref id="B67">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Fang</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>Q.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Xu</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Masked spiking transformer,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1761</fpage>&#x02013;<lpage>1771</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00169</pub-id><pub-id pub-id-type="pmid">37875104</pub-id></citation></ref>
<ref id="B68">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Shi</surname> <given-names>L.</given-names></name></person-group> (<year>2018</year>). <article-title>Spatio-temporal backpropagation for training high-performance spiking neural networks</article-title>. <source>Front. Neurosci</source>. <volume>12</volume>:<fpage>331</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2018.00331</pub-id><pub-id pub-id-type="pmid">29875621</pub-id></citation></ref>
<ref id="B69">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xiao</surname> <given-names>R.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Ma</surname> <given-names>Y.</given-names></name> <name><surname>Yan</surname> <given-names>R.</given-names></name> <name><surname>Orchard</surname> <given-names>G.</given-names></name></person-group> (<year>2019</year>). <article-title>An event-driven categorization model for aer image sensors using multispike encoding and learning</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>31</volume>, <fpage>3649</fpage>&#x02013;<lpage>3657</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2019.2945630</pub-id><pub-id pub-id-type="pmid">31714243</pub-id></citation></ref>
<ref id="B70">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Qi</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;CSNN: an augmented spiking based framework with perceptron-inception,&#x0201D;</article-title> in <source>IJCAI, Vol. 1646</source> (<publisher-loc>Stockholm</publisher-loc>). <pub-id pub-id-type="doi">10.24963/ijcai.2018/228</pub-id></citation>
</ref>
<ref id="B71">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Gao</surname> <given-names>H.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Temporal-wise attention spiking neural networks for event streams classification,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Montreal, QC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10221</fpage>&#x02013;<lpage>10230</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01006</pub-id></citation>
</ref>
<ref id="B72">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Hu</surname> <given-names>T.</given-names></name> <name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024a</year>). <article-title>Spike-driven transformer v2: meta spiking neural network architecture inspiring the design of next-generation neuromorphic chips</article-title>. <source>arXiv</source> [Preprint]. arXiv:2404.03663. <pub-id pub-id-type="doi">10.48550/arXiv.2404.03663</pub-id></citation>
</ref>
<ref id="B73">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Inherent redundancy in spiking neural networks,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>16924</fpage>&#x02013;<lpage>16934</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV51070.2023.01552</pub-id><pub-id pub-id-type="pmid">39283040</pub-id></citation></ref>
<ref id="B74">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Yuan</surname> <given-names>L.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2024b</year>). <article-title>Spike-driven transformer</article-title>. <source>Adv. Neural Inf. Process. Syst</source>. <volume>36</volume>, <fpage>64043</fpage>&#x02013;<lpage>64058</lpage>.</citation>
</ref>
<ref id="B75">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>L.</given-names></name> <name><surname>Huang</surname> <given-names>L.</given-names></name> <name><surname>Zhou</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name> <name><surname>Zhou</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Svformer: a direct training spiking transformer for efficient video action recognition</article-title>. <source>arXiv</source> [Preprint]. arXiv:2406.15034. <pub-id pub-id-type="doi">10.48550/arXiv.2406.15034</pub-id></citation>
</ref>
<ref id="B76">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Taha</surname> <given-names>A.</given-names></name> <name><surname>Taylor</surname> <given-names>W.</given-names></name> <name><surname>Zahid</surname> <given-names>A.</given-names></name> <name><surname>Rajab</surname> <given-names>K.</given-names></name> <name><surname>Heidari</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A radar-based human activity recognition using a novel 3-d point cloud classifier</article-title>. <source>IEEE Sens. J</source>. <volume>22</volume>, <fpage>18218</fpage>&#x02013;<lpage>18227</lpage>. <pub-id pub-id-type="doi">10.1109/JSEN.2022.3198395</pub-id></citation>
</ref>
<ref id="B77">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Huo</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Qian</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Pan</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;22.6 anp-i: a 28nm 1.5 pj/sop asynchronous spiking neural network processor enabling sub-0.1 &#x003BC;j/sample on-chip learning for edge-ai applications,&#x0201D;</article-title> in <source>2023 IEEE International Solid-State Circuits Conference (ISSCC)</source> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>21</fpage>&#x02013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/ISSCC42615.2023.10067650</pub-id></citation>
</ref>
<ref id="B78">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Liang</surname> <given-names>M.</given-names></name> <name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Wei</surname> <given-names>S.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;A 28nm configurable asynchronous snn accelerator with energy-efficient learning,&#x0201D;</article-title> in <source>2021 27th IEEE International Symposium on Asynchronous Circuits and Systems (ASYNC)</source> (<publisher-loc>Beijing</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>34</fpage>&#x02013;<lpage>39</lpage>. <pub-id pub-id-type="doi">10.1109/ASYNC48570.2021.00013</pub-id></citation>
</ref>
<ref id="B79">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Yan</surname> <given-names>S.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Spikformer: when spiking neural network meets transformer</article-title>. <source>arXiv</source> [Preprint]. arXiv:2209.15425. <pub-id pub-id-type="doi">10.48550/arXiv.2209.15425</pub-id></citation>
</ref>
<ref id="B80">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>A. Z.</given-names></name> <name><surname>Yuan</surname> <given-names>L.</given-names></name> <name><surname>Chaney</surname> <given-names>K.</given-names></name> <name><surname>Daniilidis</surname> <given-names>K.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Unsupervised event-based learning of optical flow, depth, and egomotion,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>989</fpage>&#x02013;<lpage>997</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2019.00108</pub-id><pub-id pub-id-type="pmid">38696288</pub-id></citation></ref>
<ref id="B81">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>S.</given-names></name> <name><surname>Mu</surname> <given-names>Y.</given-names></name> <name><surname>Zuo</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>C.</given-names></name></person-group> (<year>2023</year>). <article-title>Event-based human pose tracking by spiking spatiotemporal transformer</article-title>. <source>arXiv</source> [Preprint]. arXiv:2303.09681. <pub-id pub-id-type="doi">10.48550/arXiv.2303.09681</pub-id></citation>
</ref>
</ref-list>
</back>
</article>