<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2026.1774551</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A spatio-temporal attention enhanced CNN method for marker localization in AUV docking</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Xing</surname><given-names>Runfa</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3325461/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname><given-names>Lichuan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project-administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Huang</surname><given-names>Bing</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Han</surname><given-names>Guangyao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Liu</surname><given-names>Lu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Marine Science and Technology, Northwestern Polytechnical University</institution>, <city>Xi&#x2018;an</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Shenzhen Research Institute of Northwestern Polytechnical University</institution>, <city>Shenzhen</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Lichuan Zhang, <email xlink:href="mailto:zlc@nwpu.edu.cn">zlc@nwpu.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-06">
<day>06</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1774551</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>15</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Xing, Zhang, Huang, Han and Liu.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Xing, Zhang, Huang, Han and Liu</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-06">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Underwater docking of autonomous underwater vehicles (AUVs) typically depends on complete visual detection of markers. When markers are only partially visible because of occlusion or departure from the field of view, conventional localization methods based on complete features become ineffective, interrupting the docking operation. To address this limitation, an enhanced orientation-aware method based on a spatiotemporal attention convolutional neural network (CNN) is proposed in this study. The core of the method is a dual-path feature fusion architecture: the spatial path extracts discriminative features of the visible marker segments from single frames, while the temporal path aggregates features across consecutive frames to compensate for the insufficiency of single-frame information. The two pathways are adaptively fused through a spatiotemporal attention module designed to focus dynamically on the most informative cues, enabling robust qualitative judgment of the marker&#x2019;s relative orientation. Experimental validation in underwater environments demonstrated that the proposed method maintains stable orientation awareness even when the marker is severely off-center or largely obscured. The approach significantly extends the initial capture range for AUV docking guidance and effectively enhances the robustness and operational continuity of the system under extreme visual conditions.</p>
</abstract>
<kwd-group>
<kwd>AUV</kwd>
<kwd>CNN</kwd>
<kwd>docking</kwd>
<kwd>spatio-temporal attention</kwd>
<kwd>visual localization</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Local Science and Technology Special Foundation under the Guidance of the Central Government of Shenzhen (JCYJ20210324122406019), the Smart Eye Action Fund (Grant Number: 62602010321), and the National Key Research and Development Program of China (Grant Number: 2022YFC2805200).</funding-statement>
</funding-group>
<counts>
<fig-count count="12"/>
<table-count count="8"/>
<equation-count count="7"/>
<ref-count count="34"/>
<page-count count="14"/>
<word-count count="7198"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Ocean Observation</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<sec id="s1_1">
<label>1.1</label>
<title>Background</title>
<p>Autonomous Underwater Vehicles (AUVs) play a crucial role in underwater exploration and search missions (<xref ref-type="bibr" rid="B22">Teng et&#xa0;al., 2020</xref>) (<xref ref-type="bibr" rid="B17">Ma et&#xa0;al., 2023</xref>). Upon completion of tasks, they are required to dock with a recovery device underwater for recharging and data uploading. The success rate and efficiency of docking depend significantly on the accuracy of relative pose estimation between the AUV and the recovery device. Acoustic localization is suitable for long-range positioning (<xref ref-type="bibr" rid="B19">Pearson et&#xa0;al., 2014</xref>); however, its errors increase substantially at close range, making it inadequate for high-precision docking requirements (<xref ref-type="bibr" rid="B4">Gu et&#xa0;al., 2022</xref>). To overcome the limitations of acoustic localization in near-field scenarios, visual localization has become an essential approach for achieving high-precision underwater docking of AUVs. Despite challenges such as poor underwater lighting conditions and the small size of AUVs (<xref ref-type="bibr" rid="B20">Peng et&#xa0;al., 2019</xref>), visual localization systems&#x2014;often implemented using markers and monocular cameras&#x2014;remain an effective solution for close-range pose estimation.</p>
<p>The accuracy of visual localization is directly correlated with the number of feature points acquired from the observed markers. Within the effective visual region, the predefined set of feature points can be fully extracted by the system, enabling high-precision localization. However, when the observation position exceeds the effective region, only partial feature information can be obtained (<xref ref-type="bibr" rid="B16">Lu et&#xa0;al., 2018</xref>). Under such circumstances, the core challenge of visual localization lies in how to maintain localization under incomplete observations so that the carrier can re-enter the effective visual region. This challenge imposes stringent requirements on the robustness of the system and significantly restricts the applicability of visual localization technology. In underwater visual localization, situations where the target moves beyond the effective visual area are common (<xref ref-type="bibr" rid="B10">Jin et&#xa0;al., 2018</xref>). Existing systems typically utilize local visual information and attempt to return to the effective area through positional adjustments, which enhances robustness to some extent. However, relying solely on limited local visual information makes accurate localization difficult to achieve (<xref ref-type="bibr" rid="B16">Lu et&#xa0;al., 2018</xref>). Therefore, how to fully leverage such limited information remains a key challenge. This implies that, in addition to spatial features, temporal sequence features must be further exploited to compensate for the insufficiency of local spatial information, thereby enabling more precise localization under incomplete observations.</p>
</sec>
<sec id="s1_2">
<label>1.2</label>
<title>Related work</title>
<p>In the study of observation-guided positioning based on partial visual information, existing methods still face numerous challenges when marker information is incomplete. Xu (<xref ref-type="bibr" rid="B27">Xu et&#xa0;al., 2021</xref>) employed QR codes as markers, so the localization process relies heavily on intact image information; once the QR code is damaged, effective positioning cannot be achieved. Although the matrix-based deployment of QR codes can partially mitigate this issue, it still fails to fundamentally address the problem of information loss. Jyothi (<xref ref-type="bibr" rid="B11">Jyothi et&#xa0;al., 2023</xref>) used LED point light sources as markers, requiring only three key points to achieve visual localization without relying on complete marker features. However, this method&#x2019;s performance is constrained by the camera&#x2019;s vertical field of view, resulting in significantly insufficient perception in the vertical direction. Negahdaripour (<xref ref-type="bibr" rid="B18">Negahdaripour et&#xa0;al., 2001</xref>) attempted to expand the vertical field of view using a panoramic camera, but the acquired images suffer from severe distortion, leading to substantial localization errors that fail to meet the requirements for high-precision underwater docking. In contrast, the line light source marker method proposed by Xing (<xref ref-type="bibr" rid="B24">Xing et&#xa0;al., 2025a</xref>) demonstrates better adaptability in partial visual localization. Line light sources contain richer structural information than point light sources and can expand the visual perception range through line fitting. Although this method is also constrained by the vertical field of view, its advantage lies in the fact that even when part of the marker lies outside the effective visual area, residual line segment features can still be captured in both horizontal and vertical directions. Although these segments contain fewer key points than the minimum number required for PnP solving (three) and cannot be directly used for pose estimation, they still contain valuable relative orientation information. Compared to point light sources, line light sources can provide richer spatial structural information under partially visible conditions. In contrast to QR codes, their recognition process demonstrates stronger robustness against occlusion or information loss. Consequently, in practical scenarios where only partial information is available, line light source markers offer a feasible and advantageous solution for achieving stable and efficient positioning. Effectively extracting such incomplete features is therefore the key to achieving classification.</p>
<p>In the field of image feature extraction, convolutional neural networks (CNNs) and their extended algorithms have been widely applied (<xref ref-type="bibr" rid="B34">Zilu et&#xa0;al., 2020</xref>) (<xref ref-type="bibr" rid="B7">He et&#xa0;al., 2018</xref>). Zhao (<xref ref-type="bibr" rid="B32">Zhao et&#xa0;al., 2019</xref>) employed a CNN for feature extraction from underwater object images, achieving high-precision object recognition and classification. However, this method relies solely on single-frame images and lacks modeling of temporal features, which limits its adaptability in dynamic scenarios. In contrast, Han (<xref ref-type="bibr" rid="B5">Han et&#xa0;al., 2020</xref>) utilized a 3D-CNN to model spatial cloud maps, extracting spatiotemporal features directly from image sequences through spatial fusion of multiple images and accomplishing change prediction. Nevertheless, the feature fusion in this approach remains primarily spatial, with relatively limited capability in temporal feature extraction (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2025</xref>). For video sequences captured by autonomous underwater vehicles (AUVs) in motion, the spatial and temporal relationships between images are of significant importance for visual localization, and their weights need to be dynamically adjusted according to spatiotemporal characteristics (<xref ref-type="bibr" rid="B28">Xu et&#xa0;al., 2024</xref>) (<xref ref-type="bibr" rid="B1">Capone et&#xa0;al., 2025</xref>). The self-attention mechanism is capable of autonomously capturing key features in data through training and modeling dependencies among multiple image frames (<xref ref-type="bibr" rid="B23">Vaswani et&#xa0;al., 2017</xref>). Chen (<xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2021</xref>) leveraged a multi-head attention mechanism to extract inter-frame features of consecutive images from multiple dimensions, significantly improving classification accuracy compared to single-frame images. However, this method does not sufficiently account for the effect of temporal decay on consecutive frames. In practical applications, the contribution of image frames at different time points to the judgment of the current state may vary (<xref ref-type="bibr" rid="B6">He et&#xa0;al., 2023</xref>). Therefore, it is necessary to dynamically adjust the weights of images in consecutive frames to more accurately reflect temporal dependencies. Dhiman (<xref ref-type="bibr" rid="B3">Dhiman et&#xa0;al., 2021</xref>) combined a convolutional neural network with a spatiotemporal attention mechanism, effectively achieving feature extraction and state judgment of human poses. This demonstrates that the spatiotemporal attention mechanism can effectively capture spatiotemporal dependencies in continuous motion, making it suitable for state detection and classification of dynamic behaviors. In underwater visual localization applications, this mechanism can be adopted to classify visual localization outcomes. By dynamically focusing on key frames and regions through spatiotemporal attention, the robustness of visual localization in complex underwater environments can be enhanced.</p>
</sec>
<sec id="s1_3">
<label>1.3</label>
<title>Contributions</title>
<p>This study focuses on the feature extraction and classification of local visual information outside the effective visual field, yielding qualitative estimates of the relative spatial orientation and enhancing the reliability of the AUV visual localization system. The main innovations are as follows.</p>
<list list-type="order">
<list-item>
<p>To address the observation challenges of linear light source landmarks under partial visual conditions (<xref ref-type="bibr" rid="B21">Peng et&#xa0;al., 2020</xref>), their characteristic manifestations in incomplete visibility states were systematically analyzed.</p></list-item>
<list-item>
<p>To enhance feature extraction (<xref ref-type="bibr" rid="B15">Liu et&#xa0;al., 2024</xref>), a CNN-based method integrated with a spatiotemporal attention mechanism was developed, effectively combining spatial feature extraction with inter-frame temporal information capture.</p></list-item>
<list-item>
<p>To mitigate the sparsity issue in temporal features (<xref ref-type="bibr" rid="B29">Xu et&#xa0;al., 2020</xref>), a temporal decay mechanism combined with a multi-layer attention framework was introduced. This strategy preserves the discriminative power of single-frame features while enhancing the modeling capability for long-term dependencies.</p></list-item>
</list>
</sec>
<sec id="s1_4">
<label>1.4</label>
<title>Organization</title>
<p>The structure of this paper is as follows. Section 2 presents the classification of line light source markers and their observation results under local vision, elaborates the spatial feature extraction of line light source markers using convolutional neural networks, and discusses the dynamic weight adjustment and fusion of continuous images achieved through spatio-temporal attention. Section 3 describes the design and execution of an AUV local visual localization experiment and reports the real-world observation results of line light source markers under local vision. Section 4 summarizes the contributions of this study and outlines future research directions.</p>
</sec>
</sec>
<sec id="s2">
<label>2</label>
<title>Methodology</title>
<p>To achieve visual localization and acquire the relative spatial orientation with respect to the effective visual area, an algorithmic framework is proposed, as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>. First, the composition of local vision and the corresponding observation results are described, including a classification of the situations in which line light source markers may appear outside the effective visual area and a classification of the spatial orientation relative to the effective visual area. Spatial feature extraction is then implemented with convolutional neural networks, where residual block structures are utilized to extract features from each image. Because a single image contains limited effective features for visual localization, spatiotemporal features from multiple consecutive images must also be exploited. Spatiotemporal attention fusion, through temporal decay and multi-head attention mechanisms, achieves dynamic weight allocation over multiple images and cross-frame information fusion. Through this process, the effective features contained in the local visual information are comprehensively extracted.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Visual localization under partial observability algorithm process.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g001.tif">
<alt-text content-type="machine-generated">Diagram illustrating &#x201c;Visual Guidance for AUVs under Partial Observability&#x201d; with two main sections. The left section shows &#x201c;Line light sources marker&#x201d; with images of lights forming shapes and a table showing schematic diagrams with straight lines and key points. The right section covers &#x201c;Visual Guidance&#x201d; processes: (1) CNN feature extractor, (2) Temporal decay attention with a graph, (3) Multi-head attention with formulas, and (4) Spatiotemporal classifier with a 3D axis diagram and flowchart detailing projection and classification.</alt-text>
</graphic></fig>
<sec id="s2_1">
<label>2.1</label>
<title>Marker design and spatial division</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Line light sources marker model</title>
<p>Line light sources are chosen as markers because they balance information content and practicality. A point source provides only a single feature that is lost if occluded, while planar markers are complex and unstable underwater. In contrast, lines provide stronger visual robustness: even when partially visible, a single line conveys structural information such as direction and length, and multiple lines form rich geometric relations. Using line light sources therefore overcomes the fragility of point features while avoiding the high complexity of planar markers, offering an effective solution for reliable underwater visual localization.</p>
<p>Within the effective visual region, the line light source marker is constituted by four line light sources, and four key points are included, as depicted in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>. In the figure, the line light sources are represented by green lines, and points <italic>A</italic>, <italic>B</italic>, <italic>C</italic>, and <italic>D</italic> are identified as the intersection points of these line light sources (key points) (<xref ref-type="bibr" rid="B26">Xing et&#xa0;al., 2024</xref>). Line light source markers located outside the effective field of view are presented in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. In <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3A</bold></xref>, a single line light source is observed, and no key points are present. In <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3B</bold></xref>, two line light sources are included, along with one key point. In <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3C</bold></xref>, three line light sources are included, accompanied by two key points.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Within the effective visual field.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g002.tif">
<alt-text content-type="machine-generated">A geometric diagram showing a rhombus with vertices labeled A, B, C, and D at each corner. Green lines extend from each vertex, meeting at points outside the rhombus.</alt-text>
</graphic></fig>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Beyond the effective visual field. <bold>(A)</bold> Single line. <bold>(B)</bold> Double lines. <bold>(C)</bold> Triple lines.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g003.tif">
<alt-text content-type="machine-generated">Three geometric figures labeled A, B, and C. A shows two intersecting lines. B shows two lines intersecting at a red dot. C shows two connected lines forming a zigzag with red dots at the connection points.</alt-text>
</graphic></fig>
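<p>As an illustrative complement to <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref> and <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, the following minimal Python sketch maps the counts of detected line segments and key points to the coarse visibility states described above. It assumes that an upstream detector has already extracted the lines and their intersection points; the function name and category labels are illustrative placeholders rather than part of the implementation reported in this study.</p>
<code language="python">
def visibility_state(n_lines, n_keypoints):
    """Coarse visibility category from detected line and key-point counts
    (cf. Figures 2 and 3). Labels are illustrative."""
    if n_lines >= 4 and n_keypoints >= 4:
        return "inside effective visual region"   # full marker, Figure 2
    if n_lines == 3 and n_keypoints == 2:
        return "triple lines"                     # Figure 3C
    if n_lines == 2 and n_keypoints == 1:
        return "double lines"                     # Figure 3B
    if n_lines == 1:
        return "single line"                      # Figure 3A
    return "marker not observed"
</code>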
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Spatial division strategy</title>
<p>Beyond the effective visual range, incomplete visual information reduces geometric localization accuracy. By analyzing typical patterns of missing information from different directions and classifying them into categories like &#x201c;left-deviation&#x201d; or &#x201c;right-deviation,&#x201d; preliminary heading alignment can be achieved. This essentially transforms continuous localization into a discrete pattern classification task, reducing reliance on complete environmental perception. The core idea is to leverage statistical correlations between partially visible features and directional deviations, allowing the system to infer approximate orientation through pattern matching even with extensive information loss. This classification-based localization enhances system robustness under weak observation conditions, providing reliable initial constraints for subsequent fine positioning.</p>
<p><italic>x</italic>, <italic>y</italic>, and <italic>z</italic> represent three observation directions in the local visual field. In underwater docking tasks, priority is typically given to ensuring that the heading of the AUV is aligned with the z-axis direction of the visual marker, with the z-axis of the camera being parallel to the AUV&#x2019;s heading (<xref ref-type="bibr" rid="B12">Li et&#xa0;al., 2015</xref>). Consequently, when considering only the local observation results in the <italic>x</italic> and <italic>y</italic> directions, a schematic diagram is presented in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. In this figure, the boundaries <italic>x</italic> and <italic>y</italic> of the effective visual region are extended linearly, thereby dividing the surrounding space into eight directional subspaces. The initial subspace is defined as the region above the effective visual area, and the subspace indices are encoded from 1 to 8 in a clockwise direction. The effective visual region is depicted as the gray area in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. This region is defined by its bottom-left coordinate (&#x2212;<italic>x<sub>v</sub></italic>, &#x2212;<italic>y<sub>v</sub></italic>) and top-right coordinate (<italic>x<sub>v</sub>, y<sub>v</sub></italic>). Outside the effective visual region, the area from which line light source information can be observed is bounded by the bottom-left coordinate (&#x2212;<italic>x<sub>n</sub></italic>, &#x2212;<italic>y<sub>n</sub></italic>) and the top-right coordinate (<italic>x<sub>n</sub>, y<sub>n</sub></italic>). The coordinate regions and their corresponding direction mappings for the subspace are presented in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Subspace distribution schematic diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g004.tif">
<alt-text content-type="machine-generated">Illustration of a three-dimensional coordinate system with black axes labeled x, y, and z extending from a central point, intersecting a shaded square grid with red lines.</alt-text>
</graphic></fig>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Subspace mapping correspondence.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Index</th>
<th valign="middle" align="center">Relative direction</th>
<th valign="middle" align="center">Lower-left</th>
<th valign="middle" align="center">Upper-right</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">Upper</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>v</sub>, y<sub>v</sub></italic>)</td>
<td valign="middle" align="center">(<italic>x<sub>v</sub>, y<sub>n</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">Upper-Right</td>
<td valign="middle" align="center">(<italic>x<sub>v</sub>, y<sub>v</sub></italic>)</td>
<td valign="middle" align="center">(<italic>x<sub>n</sub>, y<sub>n</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center">Right</td>
<td valign="middle" align="center">(<italic>x<sub>v</sub></italic>, &#x2212;<italic>y<sub>v</sub></italic>)</td>
<td valign="middle" align="center">(<italic>x<sub>n</sub>, y<sub>v</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">Lower-Right</td>
<td valign="middle" align="center">(<italic>x<sub>v</sub></italic>, &#x2212;<italic>y<sub>n</sub></italic>)</td>
<td valign="middle" align="center">(<italic>x<sub>n</sub></italic>, &#x2212;<italic>y<sub>v</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">5</td>
<td valign="middle" align="center">Lower</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>v</sub></italic>, &#x2212;<italic>y<sub>n</sub></italic>)</td>
<td valign="middle" align="center">(<italic>x<sub>v</sub></italic>, &#x2212;<italic>y<sub>v</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">Lower-Left</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>n</sub></italic>, &#x2212;<italic>y<sub>n</sub></italic>)</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>v</sub></italic>, &#x2212;<italic>y<sub>v</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">Left</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>n</sub></italic>, &#x2212;<italic>y<sub>v</sub></italic>)</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>v</sub>, y<sub>v</sub></italic>)</td>
</tr>
<tr>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">Upper-Left</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>n</sub>, y<sub>v</sub></italic>)</td>
<td valign="middle" align="center">(&#x2212;<italic>x<sub>v</sub>, y<sub>n</sub></italic>)</td>
</tr>
</tbody>
</table>
</table-wrap>
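<p>As a concrete reading of <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>, the following minimal Python sketch assigns a subspace index to an observation position (<italic>x</italic>, <italic>y</italic>) that lies outside the effective visual region but inside the observable range. The boundary values and the function name are hypothetical placeholders used only for illustration.</p>
<code language="python">
def subspace_index(x, y, x_v, y_v):
    """Map a position outside the effective visual region to the subspace
    indices 1-8 of Table 1 (clockwise, starting from Upper)."""
    right = x > x_v       # beyond the right boundary
    left = -x > x_v       # beyond the left boundary (x below -x_v)
    upper = y > y_v       # above the upper boundary
    lower = -y > y_v      # below the lower boundary (y below -y_v)
    if upper and right:
        return 2          # Upper-Right
    if lower and right:
        return 4          # Lower-Right
    if lower and left:
        return 6          # Lower-Left
    if upper and left:
        return 8          # Upper-Left
    if upper:
        return 1          # Upper
    if right:
        return 3          # Right
    if lower:
        return 5          # Lower
    return 7              # Left
</code>
<p>For example, under this sketch a point with <italic>x</italic> &gt; <italic>x<sub>v</sub></italic> and |<italic>y</italic>| &#x2264; <italic>y<sub>v</sub></italic> falls into subspace 3 (Right), consistent with the third row of <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>.</p>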
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Spatial feature extraction</title>
<p>The convolutional operation effectively extracts spatial local features from single-frame images through local receptive fields and weight-sharing mechanisms. The feature representation is further enhanced by nonlinear activation functions, thereby achieving focused feature extraction and representation enhancement in key image regions (<xref ref-type="bibr" rid="B31">Yang et&#xa0;al., 2022</xref>) (<xref ref-type="bibr" rid="B33">Zhu et&#xa0;al., 2023</xref>). The spatial feature extraction process of the linear light source marker is illustrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, which includes the following steps: image input, convolution operation, pooling operation (spatial information compression), and hierarchical feature abstraction.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Spatial feature extraction flowchart.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a convolutional neural network process. An input image grid with green lines is followed by filter matrices, then convolution layers represented by mathematical notation. Pooling layers reduce dimensions with a two-by-two grid. Dimensional expansion involves flattening data into a vector, shown with a formula and multilayer network diagram. Descriptive labels indicate each stage.</alt-text>
</graphic></fig>
<p>The input image is <italic>I</italic> &#x2208; <italic>R<sup>H</sup></italic><sup>&#xd7;</sup><italic><sup>W</sup></italic><sup>&#xd7;</sup><italic><sup>C</sup></italic>, where <italic>H</italic>, <italic>W</italic>, and <italic>C</italic> denote the height, width, and number of channels, respectively. The feature map produced by the <italic>l</italic>-th convolutional layer is <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msup><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>W</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>D</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> as illustrated in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>.</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:munderover><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:munderover></mml:mstyle><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>&#xb7;</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>+</mml:mo><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msubsup><mml:mi>b</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msup><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> represents the convolutional kernel weights (employing an <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> kernel dimension); <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msubsup><mml:mi>b</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mtext>&#xa0;&#xa0;&#xa0;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> is the bias term; <italic>&#x3c3;</italic>(&#xb7;) is Relu activation function.</p>
<p>The single-frame features <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msup><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> extracted by the CNN serve as input to the temporal attention module, with their spatial information being propagated as demonstrated in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>.</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>F</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>N</italic> denotes the number of frames; the sequence dimension of the spatiotemporal attention mechanism therefore spans <italic>N</italic> units.</p>
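<p>The following PyTorch-style sketch illustrates the per-frame spatial feature extraction of <xref ref-type="disp-formula" rid="eq1">Equation 1</xref> and the flattening step of <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>. The channel widths, pooling sizes, and the use of plain convolutional blocks instead of the residual blocks mentioned above are simplifying assumptions; it is a minimal sketch rather than the exact network trained in this study.</p>
<code language="python">
import torch
import torch.nn as nn

class SpatialFeatureExtractor(nn.Module):
    """Per-frame CNN front end: stacked convolution + ReLU + pooling
    (Equation 1), followed by flattening into the per-frame vectors fed to
    the temporal attention module (Equation 2). Sizes are illustrative."""

    def __init__(self, in_channels=3):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4)),
        )

    def forward(self, frames):
        # frames: (N, C, H, W) -- the N consecutive frames of one sequence
        n = frames.shape[0]
        feats = self.backbone(frames)   # (N, 64, 4, 4)
        return feats.reshape(n, -1)     # X_temp with shape (N, D_L * H_L * W_L)
</code>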
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Spatio-temporal attention enhancement</title>
<p>Identifying linear light landmarks requires robust feature extraction in dynamic environments. Because of the limited field of view, spatial information from a single frame is often insufficient. To address this, spatiotemporal features from consecutive images are integrated to enhance localization robustness. Spatial features capture geometric properties, while temporal features describe motion patterns. After spatial features are extracted from a single frame, temporal information is obtained by analyzing multiple frames over time. Fusing these features expands the available information beyond the immediately visible area, enabling rough localization even when targets are partially obscured or outside the direct field of view, thus improving system adaptability in challenging conditions (<xref ref-type="bibr" rid="B30">Yan et&#xa0;al., 2023</xref>).</p>
<p>Due to the inertia of the AUV&#x2019;s motion, the image sequences captured by the onboard camera exhibit temporal coherence. However, this coherence gradually diminishes over time. The time decay parameter serves as a key descriptor of this weakening coherence, with the primary challenge lying in quantifying the decay and distinguishing normal motion from environmental disturbances. It is precisely in response to this challenge that this paper introduces a spatiotemporal attention mechanism. The feature discrepancy and temporal decay in spatiotemporal attention are synergistically integrated with the self-attention mechanism, enabling dynamic weight adjustment and fusion of sequential images (<xref ref-type="bibr" rid="B9">Hsu et&#xa0;al., 2023</xref>). As illustrated in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>, the processing pipeline for continuous linear light source marker images consists of temporal decay attention, multi-level attention, and a spatiotemporal classifier. Sequential information is processed using a hierarchical self-attention mechanism with a forgetting mechanism.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Spatio-temporal attention flowchart.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g006.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a process starting with &#x201c;Input: N consecutive frames&#x201d; leading to &#x201c;CNN-Spatial feature extraction,&#x201d; &#x201c;Temporal positional encoding + Decay coefficients,&#x201d; and &#x201c;Transformer Encoder.&#x201d; A side path details &#x201c;Temporally-decayed multi-head attention mechanism,&#x201d; leading to &#x201c;Q,K,V Update,&#x201d; &#x201c;Computing the temporally-weighted attention matrix,&#x201d; &#x201c;Refining the attention scoring mechanism,&#x201d; and &#x201c;Amalgamation of multi-head attention mechanisms.&#x201d; The main path concludes with &#x201c;Inter-frame feature synthesis&#x201d; and &#x201c;Output: Classification yield."</alt-text>
</graphic></fig>
<p>The time-decaying self-attention mechanism is designed by simultaneously considering pixel-level feature differences and temporal decay effects. Given an input sequence <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> (where B denotes the number of frames and C represents the feature dimension), the standard self-attention mechanism is formulated as shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>.</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mtext>&#xa0;max</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mi>K</mml:mi><mml:mi>T</mml:mi></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac><mml:mo stretchy="false">)</mml:mo><mml:mi>V</mml:mi></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>Q</italic> = <italic>XW<sub>Q</sub></italic>, <italic>K</italic> = <italic>XW<sub>K</sub></italic>, and <italic>V</italic> = <italic>XW<sub>V</sub></italic> are the linear projections; <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mi>K</mml:mi><mml:mi>T</mml:mi></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow></mml:math></inline-formula> captures the inter-frame feature discrepancies; and <italic>d<sub>k</sub></italic> denotes the dimensionality of the key vectors.</p>
<p>Temporal decay causes the model to focus more on recent frames while gradually forgetting distant ones. The decay coefficient is a learnable parameter, constrained to be positive through the softplus function. The temporal decay coefficient <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:mi>&#x3bb;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mi>h</mml:mi></mml:msup></mml:mrow></mml:math></inline-formula> (where h denotes the number of attention heads) adjusts the attention scores as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>.</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>q</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msubsup><mml:mi>k</mml:mi><mml:mi>j</mml:mi><mml:mi>T</mml:mi></mml:msubsup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3bb;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>t<sub>i</sub></italic> represents the timestamp of the i-th frame. <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:mi>&#x3bb;</mml:mi><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mi>&#x3bb;</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xa0;</mml:mo></mml:mrow></mml:math></inline-formula> ensures that the attenuation coefficient remains non-negative. |<italic>t<sub>i</sub></italic>&#xa0;&#x2212;&#xa0;<italic>t<sub>j</sub></italic>| is defined as the absolute value of the temporal distance.</p>
<p>Different attention layers capture patterns at varying temporal scales. The shallow layers may focus on local temporal patterns, while the deeper layers may capture global dependencies (<xref ref-type="bibr" rid="B14">Li et&#xa0;al., 2021</xref>). For the <italic>m</italic>-th attention head (out of <italic>h</italic> heads), the output is given by <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>.</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mtext>&#xa0;max</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy="false">)</mml:mo><mml:msup><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msubsup><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> is computed according to <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>, with each head possessing an independent <italic>&#x3bb;</italic><sup>(</sup><italic><sup>m</sup></italic><sup>)</sup>.</p>
<p>The final multi-head attention results are concatenated and linearly projected, as shown in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mi>M</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>H</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>O</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>Multi-head attention adaptively adjusts temporal weights through learnable decay coefficients <italic>&#x3bb;</italic>, enabling the model to dynamically focus on patterns across different time scales and to integrate information from all frames into an enhanced representation of the final frame. The spatiotemporal classifier combines the local feature extraction of the CNN with the global dependency modeling of the Transformer, performing classification based on the fused features of the last frame. The pseudocode of the algorithm is presented in <xref ref-type="statement" rid="st1"><bold>Algorithm 1</bold></xref>; an illustrative implementation sketch follows the algorithm.</p>
<statement content-type="algorithm" id="st1">
<label>Algorithm 1</label>
<p><graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-sg001.tif"/></p>
</statement>
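<p>The sketch below assembles the pieces above into a compact spatiotemporal classifier mirroring the structure of <xref ref-type="statement" rid="st1"><bold>Algorithm 1</bold></xref>: per-frame CNN features, temporally decayed multi-head attention with an independent decay parameter per head and output projection (<xref ref-type="disp-formula" rid="eq6">Equation 6</xref>), and classification from the fused representation of the final frame. The head count, hidden sizes, and the eight-way output (one class per subspace of <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>) are illustrative assumptions; the sketch reuses the SpatialFeatureExtractor and decayed_attention definitions given earlier.</p>
<code language="python">
import torch
import torch.nn as nn

class SpatioTemporalClassifier(nn.Module):
    """CNN spatial features + temporally decayed multi-head attention +
    classification over the fused last-frame representation (cf. Algorithm 1).
    All sizes are illustrative assumptions."""

    def __init__(self, feat_dim=64 * 4 * 4, d_model=128, n_heads=4, n_classes=8):
        super().__init__()
        self.extractor = SpatialFeatureExtractor()
        self.proj = nn.Linear(feat_dim, d_model)
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.w_lambda = nn.Parameter(torch.zeros(n_heads))  # one decay per head
        self.head = nn.Linear(d_model, n_classes)
        self.n_heads = n_heads
        self.d_head = d_model // n_heads

    def forward(self, frames, timestamps):
        # frames: (N, C, H, W); timestamps: (N,) frame acquisition times
        x = self.proj(self.extractor(frames))                # (N, d_model)
        q, k, v = self.w_q(x), self.w_k(x), self.w_v(x)
        heads = []
        for m in range(self.n_heads):
            s = slice(m * self.d_head, (m + 1) * self.d_head)
            heads.append(decayed_attention(q[:, s], k[:, s], v[:, s],
                                           timestamps, self.w_lambda[m]))
        fused = self.w_o(torch.cat(heads, dim=-1))           # Equation 6
        return self.head(fused[-1])                          # classify the last frame
</code>
<p>Under these assumptions, calling the model on a 20-frame sequence returns one logit per subspace, and a softmax over the logits yields the classification output described by <xref ref-type="statement" rid="st1"><bold>Algorithm 1</bold></xref>.</p>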
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and discussion</title>
<sec id="s3_1">
<label>3.1</label>
<title>Observation results of docking markers</title>
<p>The linear light source marker was designed as shown in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7A</bold></xref>. Four straight PVC tubes, each with a diameter of 10 mm and a length of 1 m, were used. Fluorescent material was uniformly coated on the PVC surface to achieve autonomous luminescence in low-light conditions, as illustrated in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7B</bold></xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Line light sources marker. <bold>(A)</bold> High-light. <bold>(B)</bold> Low-light.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g007.tif">
<alt-text content-type="machine-generated">Panel A shows a setup with light-colored rods crossed and mounted vertically, and a red cylindrical object in the background. Panel B displays the same setup in a dark environment with the rods glowing green.</alt-text>
</graphic></fig>
<p>The small, fully actuated AUV is shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8A</bold></xref>. A front-view camera is mounted on the head for underwater image acquisition, with a resolution of 1920 &#xd7; 1080 pixels and a USB communication interface. The underwater environment consists of an indoor pool with dimensions of 20 m (length) &#xd7; 10 m (width) &#xd7; 5 m (depth), as illustrated in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8B</bold></xref>. The inertial navigation unit of the AUV provides its relative position within the pool, which can be used as the ground truth for subspace localization (<xref ref-type="bibr" rid="B25">Xing et&#xa0;al., 2025b</xref>).</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Experimental apparatus and environment. <bold>(A)</bold> AUV. <bold>(B)</bold> Pool. <bold>(C)</bold> Marker.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g008.tif">
<alt-text content-type="machine-generated">A. A cylindrical device submerged in a pool, attached to a cable. B. The surface of a water tank with reflections from a skylight. C. A yellow and white frame structure submerged in water, forming a geometric pattern.</alt-text>
</graphic></fig>
<p>A linear light source marker is fixed in the pool, as depicted in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8C</bold></xref>. The position of the marker can be adjusted via a telescopic mounting mechanism. Based on <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>, the <italic>z</italic> direction was set to 1 m, while the <italic>x</italic> and <italic>y</italic> directions were locally observed according to the sequence of subspaces 1&#x2013;8. Typical observation results are presented in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>. As shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>, variations in the number of straight lines and key points are observed across the eight subspaces for the linear light source. The fiducial markers of the linear light source exhibit distinct spatial characteristics in different subspaces, meeting the diverse requirements of local visual localization.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Subspace imagery.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Index</th>
<th valign="middle" align="center">Images</th>
<th valign="middle" align="center">Index</th>
<th valign="middle" align="center">Images</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i001.tif"><alt-text content-type="machine-generated">Close-up image shows intersecting green and yellow rods against a dark background with a faint red line on the left edge. The scene is dimly lit and abstract.</alt-text></inline-graphic></td>
<td valign="middle" align="center">5</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i005.tif"><alt-text content-type="machine-generated">A single thin green line crosses diagonally from lower left to upper right against a solid black background. No additional objects or patterns are present.</alt-text></inline-graphic></td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i002.tif"><alt-text content-type="machine-generated">Two intersecting green lines form an X shape against a solid black background, with each line diagonally spanning from corner to corner. Lighting appears even and the image is closely cropped.</alt-text></inline-graphic></td>
<td valign="middle" align="center">6</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i006.tif"><alt-text content-type="machine-generated">Dark image featuring a faint green diagonal streak of light against a mostly black background, with a lower portion shaded in blue, suggesting a blurred or abstract scene.</alt-text></inline-graphic></td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i003.tif"><alt-text content-type="machine-generated">Two light-colored rods intersect at an angle against a dark background, with a thin red line at the top and a vertical yellow stripe partially visible behind the rods.</alt-text></inline-graphic></td>
<td valign="middle" align="center">7</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i007.tif"><alt-text content-type="machine-generated">Diagonal greenish line crosses a dark background, with a thin horizontal red border at the top edge. No recognizable objects or text are present.</alt-text></inline-graphic></td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i004.tif"><alt-text content-type="machine-generated">Close-up of two green rods crossing to form an X shape against a dark background, possibly part of a fence or grid structure.</alt-text></inline-graphic></td>
<td valign="middle" align="center">8</td>
<td valign="top" align="center"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-i008.tif"><alt-text content-type="machine-generated">A single thin green rod appears diagonally across a dark, nearly black background with no other visible features or objects present.</alt-text></inline-graphic></td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Docking marker localization</title>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Model training</title>
<p>To validate the effectiveness of the proposed model, a local visual localization dataset was constructed in a controlled indoor water tank environment, as illustrated in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8B</bold></xref>. The dataset was collected using an AUV equipped with a forward-looking camera, shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8A</bold></xref>. The visual system captures images at 20 fps, while marker localization outputs positioning results at 1 Hz, so each sequence corresponds to one complete visual positioning cycle of 20 consecutive frames. During docking, the AUV&#x2019;s typical navigation speed does not exceed 4 knots (approximately 2 m/s); at this speed the vehicle travels about 2 m per second, giving a spatial sampling interval of approximately 0.1 m between adjacent frames. Based on the practical requirements of visual positioning, the distance levels are divided into 10 categories. The 20-frame sequence design allows the multi-frame information within a single cycle to be fully exploited during offline training, enabling the extraction of richer features. Within each subspace, the image acquisition strategy is governed along the horizontal and vertical dimensions, as expressed in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>.</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left" equalrows="true" equalcolumns="true"><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x2264;</mml:mo><mml:mi>x</mml:mi><mml:mo>&#x2264;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:mi>y</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x2264;</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2264;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mi>z</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:mi>z</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x2264;</mml:mo><mml:mi>z</mml:mi><mml:mo>&#x2264;</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>x, y, z</italic> represent the position coordinates of the AUV&#x2019;s forward-looking camera. <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> denote the coordinates of the lower-left corner of the subspace, while <italic>x<sub>end</sub>, y<sub>end</sub></italic> correspond to the coordinates of the upper-right corner.</p>
<p>For the horizontal direction, a lateral grid sampling method is employed, with the acquisition path running from left to right and from bottom to top and a fixed acquisition interval of &#x394;<italic>x</italic> = &#x394;<italic>y</italic> = 0.1 m. For the vertical direction, a longitudinal distance sequence sampling method is used: the initial distance <italic>z</italic><sub>0</sub> is set to 1 m, the cutoff distance <italic>z<sub>end</sub></italic> to 10 m, and the acquisition interval &#x394;<italic>z</italic> to 1 m. This process simulates the continuous visual changes observed as the AUV approaches the target from a distance. The subspace number is used as the ground truth for local visual localization, and image acquisition is completed sequentially in all subspaces in clockwise order according to numbers 1&#x2013;8. Ultimately, the dataset comprises 12,800 image sequences (8 subspaces &#xd7; 10 distance levels &#xd7; 160 horizontal sampling points). To assess model performance, the dataset is randomly partitioned into training, validation, and test sets at a ratio of 6:2:2, as detailed in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>, with a balanced distribution across subspaces and distance levels. All data are collected under static, clear water conditions with constant illumination to establish baseline performance.</p>
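<p>To make the acquisition strategy concrete, the following minimal Python sketch enumerates the sampling grid implied by Equation 7 and checks the resulting dataset size. The per-subspace corner coordinates used here are hypothetical placeholders, since the paper specifies only the sampling intervals and the total counts.</p>
<preformat>
# Minimal sketch of the grid sampling strategy (Equation 7).
# The per-subspace corner coordinates (x0, y0, x_end, y_end) are hypothetical
# placeholders; the paper specifies only the intervals and the total counts.
import numpy as np

DX = DY = 0.1            # horizontal acquisition interval (m)
DZ = 1.0                 # vertical acquisition interval (m)
Z0, Z_END = 1.0, 10.0    # distance range (m), giving 10 distance levels

def sample_positions(x0, y0, x_end, y_end):
    """Enumerate camera positions left-to-right, bottom-to-top, near-to-far."""
    xs = np.arange(x0, x_end + 1e-9, DX)
    ys = np.arange(y0, y_end + 1e-9, DY)
    zs = np.arange(Z0, Z_END + 1e-9, DZ)
    return [(x, y, z) for z in zs for y in ys for x in xs]

# Hypothetical subspace extent chosen so that one distance level contains
# 160 horizontal sampling points (a 16 x 10 grid at 0.1 m spacing).
positions = sample_positions(0.0, 0.0, 1.5, 0.9)
per_level = len(positions) // 10
print(per_level)            # 160 horizontal points per distance level
print(8 * 10 * per_level)   # 12800 sequences over 8 subspaces
</preformat>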
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Dataset composition.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Samples</th>
<th valign="middle" align="center">Training</th>
<th valign="middle" align="center">Validation</th>
<th valign="middle" align="center">Test</th>
<th valign="middle" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Quantity</td>
<td valign="middle" align="center">7680</td>
<td valign="middle" align="center">2560</td>
<td valign="middle" align="center">2560</td>
<td valign="middle" align="center">12800</td>
</tr>
<tr>
<td valign="middle" align="center">Proportion</td>
<td valign="middle" align="center">60%</td>
<td valign="middle" align="center">20%</td>
<td valign="middle" align="center">20%</td>
<td valign="middle" align="center">100%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The model architecture is organized into two stages. For spatial feature extraction, the CNN backbone consists primarily of two Conv-ReLU-MaxPool modules that perform the initial extraction of spatial features. To further enhance feature representation and gradient flow, residual blocks (ResBlocks) are introduced after these two modules, each comprising two convolutional layers and a skip connection. The network then compresses the feature maps into a fixed-length vector through a Global Average Pooling (GAP) layer, which replaces the traditional fully connected layers to reduce the number of parameters and improve generalization; the GAP output can be regarded directly as a flattened feature representation. For spatio-temporal feature fusion, the spatial features extracted by the CNN are fed into a spatio-temporal attention Transformer encoder with eight attention heads and a hidden dimension of 64. A learnable temporal decay factor (initialized to 0.95) models the relative importance of features at different time steps within the sequence. The training parameters are listed in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>. All experiments were conducted through offline training on a server equipped with an Nvidia RTX 3090 GPU. The trained model achieved an inference speed of 100 fps on the test set, meeting the real-time requirement. To ensure statistical reliability and to evaluate variance, five independent training-and-testing runs were conducted, each initialized with a different random seed for model weights and data loading order.</p>
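<p>For illustration, the architecture described above can be sketched in PyTorch as follows. The channel widths, kernel sizes, the classification head, and the exact way the learnable decay factor enters the temporal fusion are assumptions made for this sketch; only the overall structure (two Conv-ReLU-MaxPool modules, residual blocks, global average pooling, an eight-head Transformer encoder with a hidden dimension of 64, and a decay factor initialized to 0.95) follows the description in the text.</p>
<preformat>
# Illustrative sketch of the spatio-temporal attention enhanced CNN described
# in the text. Channel widths, kernel sizes, the classification head, and the
# exact use of the decay factor are assumptions; the overall structure follows
# the paper (Conv-ReLU-MaxPool x 2, ResBlocks, GAP, 8-head Transformer encoder
# with hidden dimension 64, learnable decay initialized to 0.95).
import torch
import torch.nn as nn

class ResBlock(nn.Module):
    def __init__(self, ch):
        super().__init__()
        self.conv1 = nn.Conv2d(ch, ch, 3, padding=1)
        self.conv2 = nn.Conv2d(ch, ch, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.conv2(self.relu(self.conv1(x)))
        return self.relu(out + x)                    # skip connection

class SpatioTemporalLocalizer(nn.Module):
    def __init__(self, num_classes=8, dim=64):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2),
            nn.Conv2d(32, dim, 3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(2),
            ResBlock(dim), ResBlock(dim),
            nn.AdaptiveAvgPool2d(1),                 # GAP replaces FC layers
        )
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.decay = nn.Parameter(torch.tensor(0.95))   # learnable temporal decay
        self.head = nn.Linear(dim, num_classes)

    def forward(self, frames):                       # frames: (B, T, 3, H, W)
        b, t = frames.shape[:2]
        feats = self.backbone(frames.flatten(0, 1))  # (B*T, dim, 1, 1)
        feats = feats.flatten(1).view(b, t, -1)      # (B, T, dim)
        # assumed usage: older frames are attenuated by the learnable decay
        ages = torch.arange(t - 1, -1, -1, device=frames.device, dtype=frames.dtype)
        feats = feats * self.decay.clamp(0.0, 1.0) ** ages.view(1, t, 1)
        fused = self.encoder(feats).mean(dim=1)      # aggregate over time steps
        return self.head(fused)                      # logits over subspaces 1-8

model = SpatioTemporalLocalizer()
logits = model(torch.randn(2, 20, 3, 64, 64))        # two dummy 20-frame sequences
print(logits.shape)                                   # torch.Size([2, 8])
</preformat>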
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Neural network training parameters.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Parameter</th>
<th valign="middle" align="center">Value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Optimizer</td>
<td valign="middle" align="center">AdamW</td>
</tr>
<tr>
<td valign="middle" align="center">Learning rate scheduler</td>
<td valign="middle" align="center">CosineAnnealingLR</td>
</tr>
<tr>
<td valign="middle" align="center">Training epoch</td>
<td valign="middle" align="center">50</td>
</tr>
<tr>
<td valign="middle" align="center">Batch size</td>
<td valign="middle" align="center">32</td>
</tr>
<tr>
<td valign="middle" align="center">Loss function</td>
<td valign="middle" align="center">CrossEntropyLoss</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The final reported performance is the average of these five runs, yielding a mean accuracy of 98.52%, and the narrow confidence interval across runs indicates stable performance and high reproducibility. The confusion matrix in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref> provides a finer-grained view of model performance. The classification accuracy across the eight subspaces ranges between 98.1% and 98.8%, a balanced distribution without significant performance bias toward any specific subspace (maximum inter-class difference below 1 percentage point). This indicates consistent discriminative ability across all subspaces, with no clear signs of overfitting or underfitting. The generalization performance is thus validated under the experimental conditions of this study.</p>
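<p>The per-subspace accuracies and the inter-class spread quoted above can be read directly from a confusion matrix; a minimal sketch of the computation is given below, using a randomly generated 8&#xa0;&#xd7;&#xa0;8 count matrix as a stand-in for the matrix in Figure&#xa0;9.</p>
<preformat>
# Per-class accuracy and maximum inter-class spread from a confusion matrix.
# cm[i, j] counts sequences of true subspace i predicted as subspace j.
# The matrix below is a made-up placeholder, not the matrix in Figure 9.
import numpy as np

rng = np.random.default_rng(0)
cm = np.diag(rng.integers(171, 213, size=8)).astype(float)
cm += rng.integers(0, 3, size=(8, 8)) * (1 - np.eye(8))   # a few off-diagonal errors

per_class_acc = np.diag(cm) / cm.sum(axis=1)     # accuracy for each true subspace
overall_acc = np.trace(cm) / cm.sum()            # overall accuracy
spread = per_class_acc.max() - per_class_acc.min()   # inter-class difference

print(per_class_acc.round(3), overall_acc.round(4), spread.round(4))
</preformat>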
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Confusion matrix.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g009.tif">
<alt-text content-type="machine-generated">Confusion matrix for subspace classification displays numbers along a diagonal line indicating correct predictions, with values ranging from one hundred seventy-one to two hundred twelve. Misclassifications are represented off-diagonal. A color gradient from light to dark blue indicates values from zero to over two hundred.</alt-text>
</graphic></fig>
<p>The loss and accuracy curves over the first 20 training epochs are presented in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>. The rapid, synchronous descent of the training and validation losses during the initial phase indicates efficient learning and good generalization, with no sign of significant overfitting. The Top-1 and Top-5 accuracies rose rapidly, with the Top-5 accuracy quickly converging toward its theoretical maximum. By the 20th epoch, all curves had plateaued, indicating that training had converged to a stable performance level.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Loss and accuracy curves.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g010.tif">
<alt-text content-type="machine-generated">Four line charts showing training and validation metrics over twenty epochs. The top left chart displays train loss decreasing from two to below 0.5. Top right chart shows metrics accuracy top one increasing from 0.3 to nearly one. Bottom left chart depicts validation loss dropping from 1.75 to below 0.25. Bottom right chart illustrates metrics accuracy top five rising from 0.9 to near one. The charts include blue lines representing results and orange dotted lines for smooth data.</alt-text>
</graphic></fig>
<p>To verify the accuracy and real-time performance of the proposed model in the AUV local visual localization task, systematic comparative experiments were conducted against two representative baselines for spatiotemporal sequence processing. The 3D-CNN baseline is an end-to-end spatiotemporal feature extractor that applies three-dimensional convolutions directly to the input image sequence (time &#xd7; height &#xd7; width &#xd7; channels), capturing spatial appearance and temporal dynamics simultaneously. The LSTM baseline feeds per-frame images into an LSTM network to model their temporal dependencies. All comparative experiments used identical dataset splits (training/validation/test) and the same training hyperparameters (optimizer, learning rate, number of epochs, etc., as listed in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>) to ensure a fair comparison. Each model was independently trained five times, and its average performance is reported. Detailed results are presented in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>.</p>
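<p>As a rough reference, the two baselines can be instantiated as sketched below; the layer widths and depths are assumptions, since the text specifies only how each model consumes the input (joint 3D convolution over the whole sequence for the 3D-CNN, frame-by-frame recurrence for the LSTM).</p>
<preformat>
# Rough sketches of the comparison baselines (widths and depths are assumptions).
import torch
import torch.nn as nn

class Conv3DBaseline(nn.Module):
    """3D-CNN: joint spatio-temporal convolution over (T, H, W)."""
    def __init__(self, num_classes=8):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv3d(3, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool3d(2),
            nn.Conv3d(32, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool3d(1),
        )
        self.head = nn.Linear(64, num_classes)

    def forward(self, x):              # x: (B, T, 3, H, W)
        x = x.permute(0, 2, 1, 3, 4)   # to (B, 3, T, H, W) for Conv3d
        return self.head(self.features(x).flatten(1))

class LSTMBaseline(nn.Module):
    """Per-frame CNN features fed frame by frame into an LSTM."""
    def __init__(self, num_classes=8, dim=64):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, dim, 3, padding=1), nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
        )
        self.lstm = nn.LSTM(dim, dim, batch_first=True)
        self.head = nn.Linear(dim, num_classes)

    def forward(self, x):                              # x: (B, T, 3, H, W)
        b, t = x.shape[:2]
        f = self.cnn(x.flatten(0, 1)).flatten(1).view(b, t, -1)
        out, _ = self.lstm(f)
        return self.head(out[:, -1])                   # use the last time step
</preformat>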
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Comparison of real-time performance for different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Parameters (M)</th>
<th valign="middle" align="center">Inference speed (fps)</th>
<th valign="middle" align="center">Training time (h)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">3D-CNN</td>
<td valign="middle" align="center">95.72%</td>
<td valign="middle" align="center">15.3</td>
<td valign="middle" align="center">42</td>
<td valign="middle" align="center">3.8</td>
</tr>
<tr>
<td valign="middle" align="center">LSTM</td>
<td valign="middle" align="center">97.02%</td>
<td valign="middle" align="center">8.7</td>
<td valign="middle" align="center">78</td>
<td valign="middle" align="center">2.7</td>
</tr>
<tr>
<td valign="middle" align="center">Ours</td>
<td valign="middle" align="center">98.52%</td>
<td valign="middle" align="center">5.2</td>
<td valign="middle" align="center">100</td>
<td valign="middle" align="center">1.5</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The proposed model shows clear advantages in classification accuracy, parameter efficiency, inference speed, and training efficiency. Its average accuracy reaches 98.52%, approximately 2.8 and 1.5 percentage points higher than 3D-CNN and LSTM, respectively, and the minimal overlap of the confidence intervals across runs indicates that the improvement is statistically meaningful. With only 5.2 million parameters, the model is considerably more compact than 3D-CNN (15.3 million) and LSTM (8.7 million). This efficiency is attributed to the representational capacity of the Transformer attention mechanism, which enables high performance with a lightweight architecture suitable for deployment on AUV embedded platforms with limited computational and storage resources. In terms of inference speed, the model reaches 100 fps, significantly outperforming 3D-CNN (42 fps) and LSTM (78 fps). The training process requires only 1.5 hours, compared with 3.8 hours for 3D-CNN, further underscoring the model&#x2019;s practicality. To further validate real-time inference on embedded hardware, the models were deployed on the edge computer carried by the AUV (Nvidia Jetson Orin NX, up to 100 TOPS). Because the AUV camera operates at 20 fps and the system must reserve computational resources for other functions, an image processing rate of 30 fps was set as the real-time safety threshold. In this embedded environment, the proposed model sustains a stable inference rate of 30 fps, meeting the dual requirements of real-time performance and computational headroom, whereas the LSTM method reaches only about 20 fps and the 3D-CNN method about 13 fps, neither of which satisfies real-time processing while leaving sufficient system headroom.</p>
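<p>A simple way to reproduce throughput figures of this kind is to time repeated forward passes over dummy 20-frame sequences, as in the sketch below; the input resolution, warm-up count, and iteration count are illustrative assumptions.</p>
<preformat>
# Minimal throughput check for sequence inference (frames processed per second).
# Input size, warm-up count, and iteration count are illustrative assumptions.
import time
import torch

@torch.no_grad()
def measure_fps(model, frames_per_seq=20, iters=50, device="cuda"):
    model = model.to(device).eval()
    x = torch.randn(1, frames_per_seq, 3, 64, 64, device=device)
    for _ in range(10):                      # warm-up iterations
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - t0
    return iters * frames_per_seq / elapsed  # processed frames per second
</preformat>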
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Experimental validation</title>
<p>During online validation of the algorithm, an ablation study on the temporal component was conducted, as summarized in <xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref>, to demonstrate the effectiveness of the spatio-temporal attention mechanism in local visual localization (<xref ref-type="bibr" rid="B8">Hille et&#xa0;al., 2023</xref>). The image acquisition frequency is set at 20 fps, with each sequence comprising 20 frames, corresponding to a one-second marker localization cycle. This design ensures both the integrity of the marker localization cycle and consistency between sequence construction and offline training.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparison experiments.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Category</th>
<th valign="middle" align="center">Features</th>
<th valign="middle" align="center">Weight</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Group 1</td>
<td valign="middle" align="center">Spatial</td>
<td valign="middle" align="center">1</td>
</tr>
<tr>
<td valign="middle" align="center">Group 2</td>
<td valign="middle" align="center">Average</td>
<td valign="middle" align="center"><italic>&#x3b2;</italic> = 1<italic>/n</italic></td>
</tr>
<tr>
<td valign="middle" align="center">Group 3</td>
<td valign="middle" align="center">Exponentially decaying</td>
<td valign="middle" align="center"><italic>&#x3b2;</italic> = 1<italic>/</italic>2<italic><sup>n</sup></italic></td>
</tr>
<tr>
<td valign="middle" align="center">Our Algorithm</td>
<td valign="middle" align="center">Spatio-Temporal attention</td>
<td valign="middle" align="center">Learned via training</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Baseline Model (Comparison Group 1): a static single-frame input architecture was employed, which extracted spatial features exclusively, without incorporating temporal information, and served as the fundamental control group. Uniformly Weighted Temporal Model (Comparison Group 2): the input consisted of a sequence of 20 consecutive image frames aggregated with equal weights (<inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:mi>&#x3b2;</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mn>20</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:math></inline-formula>); this model assumed an equal contribution from the features at every time step, the simplest form of temporal integration. Exponentially Decayed Temporal Model (Comparison Group 3): a temporal decay factor was introduced so that the weights decrease in a geometric progression (<italic>&#x3b2;<sub>n</sub></italic><sub>+1</sub>&#xa0;= <italic>&#x3b2;<sub>n</sub>/</italic>2); higher weights were assigned to more recent frames, while the weights of older frames decayed in a half-life pattern.</p>
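<p>The fixed weighting schemes of the three comparison groups, and the weighted aggregation they imply, can be summarized in a few lines of Python; the normalization of the exponentially decayed weights in this sketch is an assumption, and the proposed method replaces such fixed weights with attention learned during training.</p>
<preformat>
# Frame-weighting schemes used in the ablation (n = 20 frames per sequence).
# Group 1 uses only the most recent frame; Group 2 weights all frames equally;
# Group 3 halves the weight of each older frame; the proposed method learns the
# weighting through spatio-temporal attention instead of fixing it.
import numpy as np

N = 20
w_spatial = np.zeros(N)
w_spatial[-1] = 1.0                                   # Group 1: single frame
w_uniform = np.full(N, 1.0 / N)                       # Group 2: beta = 1/20
ages = np.arange(N - 1, -1, -1)                       # 0 = newest frame
w_decay = 0.5 ** ages
w_decay = w_decay / w_decay.sum()                     # Group 3 (normalized, assumed)

def fuse(features, weights):
    """Weighted aggregation of per-frame feature vectors, shape (N, D)."""
    return (weights[:, None] * features).sum(axis=0)

feats = np.random.randn(N, 64)
print(fuse(feats, w_uniform).shape)                   # (64,)
</preformat>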
<p>To validate the generality of the proposed algorithm in three-dimensional space, the basic motion trajectory was first defined in the x&#x2013;y plane, as illustrated in <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>. The AUV maintained uniform motion within each lap, and three laps were executed, positioned at the 1/4, 2/4, and 3/4 points between the visual boundary region and the visually effective region, respectively.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Schematic diagram of AUV motion trajectory.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g011.tif">
<alt-text content-type="machine-generated">Concentric squares illustrate subspace classifications. A shaded area indicates a visually effective area, outlined by a bold border denoting the visually boundary area. Blue, green, and red lines represent increasing circle velocities (v1, v2, v3) labeled as first, second, and third circles. An arrowed axis is labeled X.</alt-text>
</graphic></fig>
<p>In four independent experiments, the z-axis coordinate was fixed at 2, 4, 6, and 8 m, respectively. In each experiment, motion started from Subspace 1 and proceeded clockwise through the subspace numbering sequence. Unlike offline image acquisition, during online validation the position computed by the AUV&#x2019;s inertial navigation unit served as the ground truth for local visual localization. The results are presented in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>. Accuracy is defined as the percentage of correctly identified marker subspaces relative to the total number of identification attempts.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparative table of local visual guidance accuracy results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Num</th>
<th valign="middle" align="center">z (m)</th>
<th valign="middle" align="center">Group1</th>
<th valign="middle" align="center">Group2</th>
<th valign="middle" align="center">Group3</th>
<th valign="middle" align="center">Our algorithm</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">72.23%</td>
<td valign="middle" align="center">71.07%</td>
<td valign="middle" align="center">86.78%</td>
<td valign="middle" align="center">93.27%</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">71.67%</td>
<td valign="middle" align="center">69.01%</td>
<td valign="middle" align="center">85.35%</td>
<td valign="middle" align="center">91.63%</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">70.06%</td>
<td valign="middle" align="center">67.06%</td>
<td valign="middle" align="center">83.72%</td>
<td valign="middle" align="center">90.27%</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">68.67%</td>
<td valign="middle" align="center">66.93%</td>
<td valign="middle" align="center">80.72%</td>
<td valign="middle" align="center">87.62%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Compared to the baseline model (Group 1), Group 2, which used a uniformly weighted temporal model, suffered from interference by distant frames, resulting in a decrease in accuracy. In contrast, Group 3, which employed an exponentially decayed temporal model, showed a significant improvement in accuracy over the baseline (Group 1). In this study, an adaptive temporal decay mechanism was further implemented within a multi-head attention framework to extract spatiotemporal features across multiple dimensions, and the experimental results show that the proposed method achieves higher accuracy than the exponentially decayed model (Group 3). Because local visual localization is a continuous process, the accuracy comparison was followed by a further analysis in the temporal domain: a graph was plotted with the subspace index on the horizontal axis and the number of occurrences of five or more consecutive recognition errors on the vertical axis.</p>
<p>Curves for the four comparative experimental groups were generated, as illustrated in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref>, with separate curves for z values of 2, 4, 6, and 8. The total counts of five or more consecutive recognition errors are summarized in <xref ref-type="table" rid="T8"><bold>Table&#xa0;8</bold></xref>. The baseline model (Group 1), which relied solely on spatial features, was minimally influenced by temporal continuity: misidentifications occurred mainly as isolated incidents, with a low frequency of consecutive errors. Although the uniformly weighted temporal model (Group 2) mitigated spurious recognition spikes, it increased the number of consecutive errors. The exponentially decayed temporal model (Group 3) further suppressed these spurious spikes; however, a high rate of consecutive errors persisted, indicating that the fusion of spatiotemporal features remained insufficient.</p>
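<p>The temporal-stability metric used here, the number of occurrences of five or more consecutive recognition errors, can be computed from a per-cycle correctness sequence as in the following sketch; the example sequence is purely illustrative.</p>
<preformat>
# Count runs of at least `min_run` consecutive recognition errors.
# `correct` is a per-cycle boolean sequence: True = subspace identified correctly.

def count_error_runs(correct, min_run=5):
    runs, length = 0, 0
    for ok in correct:
        if ok:
            if length >= min_run:
                runs += 1
            length = 0
        else:
            length += 1
    if length >= min_run:          # run extending to the end of the sequence
        runs += 1
    return runs

# Illustrative example: two error bursts, only one of which reaches length 5.
seq = [True] * 10 + [False] * 5 + [True] * 3 + [False] * 4 + [True] * 2
print(count_error_runs(seq))       # 1
</preformat>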
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Number of occurrences of five or more consecutive recognition errors by subspace index for the four comparison groups. <bold>(A)</bold> z=2. <bold>(B)</bold> z=4. <bold>(C)</bold> z=6. <bold>(D)</bold> z=8.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1774551-g012.tif">
<alt-text content-type="machine-generated">Four line graphs labeled A, B, C, and D compare the number of 5+ consecutive errors by subspace index across four groups: Comparison Group 1 (red), Comparison Group 2 (blue), Comparison Group 3 (green), and &#x201c;Our Algorithm&#x201d; (purple). Each graph shows variations in error counts across subspaces 1 to 8, highlighting trends and differences among the groups.</alt-text>
</graphic></fig>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Total count of occurrences of five or more consecutive recognition errors.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Num</th>
<th valign="middle" align="center">z (m)</th>
<th valign="middle" align="center">Group1</th>
<th valign="middle" align="center">Group2</th>
<th valign="middle" align="center">Group3</th>
<th valign="middle" align="center">Our algorithm</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">27</td>
<td valign="middle" align="center">169</td>
<td valign="middle" align="center">143</td>
<td valign="middle" align="center">34</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">31</td>
<td valign="middle" align="center">154</td>
<td valign="middle" align="center">130</td>
<td valign="middle" align="center">26</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">30</td>
<td valign="middle" align="center">156</td>
<td valign="middle" align="center">144</td>
<td valign="middle" align="center">27</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">24</td>
<td valign="middle" align="center">147</td>
<td valign="middle" align="center">163</td>
<td valign="middle" align="center">31</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In contrast, the algorithm proposed in this work effectively integrates spatiotemporal features, improving the recognition accuracy of spatial features while exploiting inter-frame temporal relationships. Consecutive errors were consequently reduced significantly, validating the advantage of the proposed joint spatiotemporal modeling approach.</p>
</sec>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusion</title>
<p>A line-light marker localization method for AUV visual docking is proposed, which enables robust inference of marker orientation beyond the effective visual field through a spatiotemporal attention-enhanced convolutional neural network framework. The method first extracts spatial visual features from single-frame images using a convolutional neural network, then integrates and enhances sequential frame features by incorporating a spatiotemporal attention module with learnable temporal decay coefficients and multi-head attention mechanisms. Experiments demonstrate that this framework significantly improves the system&#x2019;s capability for continuous tracking and localization of markers in complex underwater environments, such as under varying illumination or partial occlusion, providing a stable and reliable visual guidance solution for AUV docking.</p>
<p>Deficiencies and future improvements:</p>
<list list-type="order">
<list-item>
<p>Limitations in the Applicability of Markers. Current methods require the camera and the line light source marker to maintain a specific relative pose, such as parallel alignment along the Z-axis, which restricts their application in complex motions like heading adjustments. These approaches are primarily suitable for the translational phase of static docking. Future research could explore methods that accommodate a wider range of pose variations to expand potential application scenarios.</p></list-item>
<list-item>
<p>Computational Efficiency of Spatial Feature Extraction. The spatial feature extraction module exhibits computational redundancy under static or low-speed motion conditions. Future implementations could incorporate an inter-frame difference detection mechanism, where consecutive frames with minimal variations would bypass redundant computations by directly reusing features from the previous frame, thereby enhancing overall efficiency.</p></list-item>
<list-item>
<p>Dynamic Adaptability of Spatiotemporal Feature Fusion. The time decay coefficient in the spatiotemporal attention mechanism is currently learned statically, making it difficult to adapt to dynamic motion scenarios. In the future, a dynamic time decay mechanism could be introduced to adaptively adjust the decay rate based on the input content, thereby enhancing robustness in dynamic environments.</p></list-item>
</list>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p></sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>RX: Data curation, Software, Writing &#x2013; original draft. LZ: Formal analysis, Project administration, Writing &#x2013; review &amp; editing. BH:&#xa0;Validation, Writing &#x2013; review &amp; editing. GH: Methodology, Software, Writing &#x2013; review &amp; editing. LL: Data curation, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s9" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Capone</surname> <given-names>V.</given-names></name>
<name><surname>Casolaro</surname> <given-names>A.</given-names></name>
<name><surname>Camastra</surname> <given-names>F.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Spatio-temporal prediction using graph neural networks: A survey</article-title>. <source>Neurocomputing</source> <volume>643</volume>, <fpage>130400</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2025.130400</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>C.-F. R.</given-names></name>
<name><surname>Fan</surname> <given-names>Q.</given-names></name>
<name><surname>Panda</surname> <given-names>R.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Crossvit: Cross-attention multi-scale vision transformer for image classification</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. (<publisher-loc>Los Alamitos, CA</publisher-loc>: 
<publisher-name>IEEE/CVF (The Institute of Electrical and Electronics Engineers / Computer Vision Foundation)</publisher-name>), <fpage>357</fpage>&#x2013;<lpage>366</lpage>.
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dhiman</surname> <given-names>C.</given-names></name>
<name><surname>Vishwakarma</surname> <given-names>D. K.</given-names></name>
<name><surname>Agarwal</surname> <given-names>P.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Part-wise spatio-temporal attention driven cnn-based 3d human action recognition</article-title>. <source>ACM Trans. Multimedia Comput. Commun. Appl.</source> <volume>17</volume>, <fpage>1</fpage>&#x2013;<lpage>24</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3441628</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gu</surname> <given-names>N.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Peng</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Han</surname> <given-names>Q.-L.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Advances in line-of-sight guidance for path following of autonomous marine vehicles: An overview</article-title>. <source>IEEE Trans. Systems Man Cybernetics: Syst.</source> <volume>53</volume>, <fpage>12</fpage>&#x2013;<lpage>28</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TSMC.2022.3162862</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Han</surname> <given-names>F.</given-names></name>
<name><surname>Yao</surname> <given-names>J.</given-names></name>
<name><surname>Zhu</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Underwater image processing and object detection based on deep cnn method</article-title>. <source>J. Sensors</source> <volume>2020</volume>, <fpage>6707328</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2020/6707328</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>S.</given-names></name>
<name><surname>Dai</surname> <given-names>S.-L.</given-names></name>
<name><surname>Zhao</surname> <given-names>Z.</given-names></name>
<name><surname>Zou</surname> <given-names>T.</given-names></name>
<name><surname>Ma</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Ude-based distributed formation control for msvs with collision avoidance and connectivity preservation</article-title>. <source>IEEE Trans. Ind. Inf.</source> <volume>20</volume>, <fpage>1476</fpage>&#x2013;<lpage>1487</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TII.2023.3274234</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>S.</given-names></name>
<name><surname>Wang</surname> <given-names>M.</given-names></name>
<name><surname>Dai</surname> <given-names>S.-L.</given-names></name>
<name><surname>Luo</surname> <given-names>F.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Leader&#x2013;follower formation control of usvs with prescribed performance and collision avoidance</article-title>. <source>IEEE Trans. Ind. Inf.</source> <volume>15</volume>, <fpage>572</fpage>&#x2013;<lpage>581</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TII.2018.2839739</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hille</surname> <given-names>G.</given-names></name>
<name><surname>Agrawal</surname> <given-names>S.</given-names></name>
<name><surname>Tummala</surname> <given-names>P.</given-names></name>
<name><surname>Wybranski</surname> <given-names>C.</given-names></name>
<name><surname>Pech</surname> <given-names>M.</given-names></name>
<name><surname>Surov</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Joint liver and hepatic lesion segmentation in mri using a hybrid cnn with transformer layers</article-title>. <source>Comput. Methods Programs Biomedicine</source> <volume>240</volume>, <fpage>107647</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107647</pub-id>, PMID: <pub-id pub-id-type="pmid">37329803</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hsu</surname> <given-names>T.-C.</given-names></name>
<name><surname>Liao</surname> <given-names>Y.-S.</given-names></name>
<name><surname>Huang</surname> <given-names>C.-R.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Video summarization with spatiotemporal vision transformer</article-title>. <source>IEEE Trans. Image Process.</source> <volume>32</volume>, <fpage>3013</fpage>&#x2013;<lpage>3026</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIP.2023.3275069</pub-id>, PMID: <pub-id pub-id-type="pmid">37186532</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jin</surname> <given-names>K.</given-names></name>
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>YI</surname> <given-names>H.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Key technologies and intelligence evolution of maritime uv</article-title>. <source>Chin. J. Ship Res.</source> <volume>13</volume>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.19693/j.issn.1673-3185.01293</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jyothi</surname> <given-names>V. B. N.</given-names></name>
<name><surname>Akash</surname> <given-names>S. J.</given-names></name>
<name><surname>Ramadass</surname> <given-names>G. A.</given-names></name>
<name><surname>Vedachalam</surname> <given-names>N.</given-names></name>
<name><surname>Venkataraman</surname> <given-names>H.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Design and development of deep learning-aided vision guidance system for auv homing applications</article-title>. <source>IEEE Embedded Syst. Lett.</source> <volume>16</volume>, <fpage>198</fpage>&#x2013;<lpage>201</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LES.2023.3339145</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Jiang</surname> <given-names>Y.</given-names></name>
<name><surname>Cao</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>B.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2015</year>). 
<article-title>Auv docking experiments based on vision positioning using two cameras</article-title>. <source>Ocean Eng.</source> <volume>110</volume>, <fpage>163</fpage>&#x2013;<lpage>173</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2015.10.015</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>W.</given-names></name>
<name><surname>Gong</surname> <given-names>G.</given-names></name>
<name><surname>Ding</surname> <given-names>X.</given-names></name>
<name><surname>Zhong</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Su-yolo: Spiking neural network for efficient underwater object detection</article-title>. <source>Neurocomputing</source>, <fpage>130310</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2025.130310</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Tu</surname> <given-names>Z.</given-names></name>
<name><surname>Lyu</surname> <given-names>M. R.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>On the diversity of multi-head attention</article-title>. <source>Neurocomputing</source> <volume>454</volume>, <fpage>14</fpage>&#x2013;<lpage>24</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2021.04.038</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Zhu</surname> <given-names>H.</given-names></name>
<name><surname>Song</surname> <given-names>W.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Yan</surname> <given-names>L.</given-names></name>
<name><surname>Wang</surname> <given-names>K.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Research on improved vgg-16 model based on transfer learning for acoustic image recognition of underwater search and rescue targets</article-title>. <source>IEEE J. Selected Topics Appl. Earth Observations Remote Sens</source>. <volume>17</volume>, <page-range>18112&#x2013;18128</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2024.3459928</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>G.</given-names></name>
<name><surname>Sun</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Adaptive cooperative formation control of autonomous surface vessels with uncertain dynamics and external disturbances</article-title>. <source>Ocean Eng.</source> <volume>167</volume>, <fpage>36</fpage>&#x2013;<lpage>44</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2018.08.020</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ma</surname> <given-names>T.</given-names></name>
<name><surname>Ding</surname> <given-names>S.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Fan</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A review of terrain aided navigation for underwater vehicles</article-title>. <source>Ocean Eng.</source> <volume>281</volume>, <fpage>114779</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2023.114779</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Negahdaripour</surname> <given-names>S.</given-names></name>
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Firoozfam</surname> <given-names>P.</given-names></name>
<name><surname>Oles</surname> <given-names>J.</given-names></name>
</person-group> (<year>2001</year>). &#x201c;
<article-title>Utilizing panoramic views for visually guided tasks in underwater robotics applications</article-title>,&#x201d; in <source>MTS/IEEE Oceans 2001. An Ocean Odyssey. Conference Proceedings (IEEE Cat. No. 01CH37295)</source>, vol. <volume>4</volume>. (<publisher-loc>Honolulu, HI, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>2593</fpage>&#x2013;<lpage>2600</lpage>.
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Pearson</surname> <given-names>D.</given-names></name>
<name><surname>An</surname> <given-names>E.</given-names></name>
<name><surname>Dhanak</surname> <given-names>M.</given-names></name>
<name><surname>von Ellenrieder</surname> <given-names>K.</given-names></name>
<name><surname>Beaujean</surname> <given-names>P.</given-names></name>
</person-group> (<year>2014</year>). <source>High-level fuzzy logic guidance system for an unmanned surface vehicle (USV) tasked to perform autonomous launch and recovery (ALR) of an autonomous underwater vehicle (AUV)</source> (<publisher-loc>Oxford, MS, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>).
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Peng</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Li</surname> <given-names>T.</given-names></name>
<name><surname>Han</surname> <given-names>M.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Output-feedback cooperative formation maneuvering of autonomous surface vehicles with connectivity preservation and collision avoidance</article-title>. <source>IEEE Trans. cybernetics</source> <volume>50</volume>, <fpage>2527</fpage>&#x2013;<lpage>2535</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TCYB.2019.2914717</pub-id>, PMID: <pub-id pub-id-type="pmid">31180878</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Peng</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Han</surname> <given-names>Q.-L.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>An overview of recent advances in coordinated control of multiple autonomous surface vehicles</article-title>. <source>IEEE Trans. Ind. Inf.</source> <volume>17</volume>, <fpage>732</fpage>&#x2013;<lpage>745</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TII.2020.3004343</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Teng</surname> <given-names>M.</given-names></name>
<name><surname>Ye</surname> <given-names>L.</given-names></name>
<name><surname>Yuxin</surname> <given-names>Z.</given-names></name>
<name><surname>Yanqing</surname> <given-names>J.</given-names></name>
<name><surname>Zheng</surname> <given-names>C.</given-names></name>
<name><surname>Qiang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>An auv localization and path planning algorithm for terrain-aided navigation</article-title>. <source>ISA Trans.</source> <volume>103</volume>, <fpage>215</fpage>&#x2013;<lpage>227</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isatra.2020.04.007</pub-id>, PMID: <pub-id pub-id-type="pmid">32336466</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Vaswani</surname> <given-names>A.</given-names></name>
<name><surname>Shazeer</surname> <given-names>N.</given-names></name>
<name><surname>Parmar</surname> <given-names>N.</given-names></name>
<name><surname>Uszkoreit</surname> <given-names>J.</given-names></name>
<name><surname>Jones</surname> <given-names>L.</given-names></name>
<name><surname>Gomez</surname> <given-names>A. N.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). 
<article-title>Attention is all you need</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>30</volume>, <page-range>5998&#x2013;6008</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xing</surname> <given-names>R.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Han</surname> <given-names>G.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2025</year>a). 
<article-title>Reconfigurable line light sources sensing method for vision-based auv guidance</article-title>. <source>IEEE Sensors J</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSEN.2025.3571393</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Xing</surname> <given-names>R.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Han</surname> <given-names>G.</given-names></name>
<name><surname>Yang</surname> <given-names>C.</given-names></name>
<name><surname>Yuan</surname> <given-names>C.</given-names></name>
</person-group> (<year>2025</year>b). &#x201c;
<article-title>Research on underwater visual docking control method for dual-auvs</article-title>,&#x201d; in <source>2025 10th International Conference on Control and Robotics Engineering (ICCRE)</source> (<publisher-loc>Nagoya, Japan</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>190</fpage>&#x2013;<lpage>195</lpage>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Xing</surname> <given-names>R.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Han</surname> <given-names>G.</given-names></name>
<name><surname>Zhu</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>A visual perception algorithm for auv dynamic docking based on variable markers</article-title>,&#x201d; in <source>OCEANS 2024-Halifax</source> (<publisher-loc>Halifax, NS, Canada</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>.
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>Z.</given-names></name>
<name><surname>Haroutunian</surname> <given-names>M.</given-names></name>
<name><surname>Murphy</surname> <given-names>A. J.</given-names></name>
<name><surname>Neasham</surname> <given-names>J.</given-names></name>
<name><surname>Norman</surname> <given-names>R.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>An underwater visual navigation method based on multiple aruco markers</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>9</volume>, <fpage>1432</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse9121432</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>H.</given-names></name>
<name><surname>Ling</surname> <given-names>Z.</given-names></name>
<name><surname>Yuan</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A video object detector with spatio-temporal attention module for micro uav detection</article-title>. <source>Neurocomputing</source> <volume>597</volume>, <fpage>127973</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2024.127973</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>X.</given-names></name>
<name><surname>Lu</surname> <given-names>Y.</given-names></name>
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Intelligent collision avoidance algorithms for usvs via deep reinforcement learning under colregs</article-title>. <source>Ocean Eng.</source> <volume>217</volume>, <fpage>107704</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2020.107704</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yan</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Yang</surname> <given-names>X.</given-names></name>
<name><surname>Chen</surname> <given-names>C.</given-names></name>
<name><surname>Guan</surname> <given-names>X.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Communication-aware motion planning of auv in obstacle-dense environment: A binocular vision-based deep learning method</article-title>. <source>IEEE Trans. Intelligent Transportation Syst.</source> <volume>24</volume>, <fpage>14927</fpage>&#x2013;<lpage>14943</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TITS.2023.3296415</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>D.</given-names></name>
<name><surname>Cheng</surname> <given-names>C.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Pan</surname> <given-names>G.</given-names></name>
<name><surname>Zhang</surname> <given-names>F.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Side-scan sonar image segmentation based on multi-channel CNN for AUV navigation</article-title>. <source>Front. Neurorobotics</source> <volume>16</volume>, <elocation-id>928206</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fnbot.2022.928206</pub-id>, PMID: <pub-id pub-id-type="pmid">35928729</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>X.</given-names></name>
<name><surname>Wei</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Zhu</surname> <given-names>T.</given-names></name>
<name><surname>Zhang</surname> <given-names>K.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>3D-CNN-based feature extraction of ground-based cloud images for direct normal irradiance prediction</article-title>. <source>Solar Energy</source> <volume>181</volume>, <fpage>510</fpage>&#x2013;<lpage>518</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.solener.2019.01.096</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>H.</given-names></name>
<name><surname>Cui</surname> <given-names>Z.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Jiang</surname> <given-names>S.</given-names></name>
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A method for inverting shallow sea acoustic parameters based on the backward feedback neural network model</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>11</volume>, <fpage>1340</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse11071340</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zilu</surname> <given-names>O.</given-names></name>
<name><surname>Hongdong</surname> <given-names>W.</given-names></name>
<name><surname>Yi</surname> <given-names>H.</given-names></name>
<name><surname>Kaiwen</surname> <given-names>Y.</given-names></name>
<name><surname>Hong</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Path planning technologies for USV formation based on improved RRT</article-title>. <source>Chin. J. Ship Res.</source> <volume>15</volume>, <fpage>18</fpage>&#x2013;<lpage>24</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.19693/j.issn.1673-3185.01639</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2076292">Chengbo Wang</ext-link>, Xidian University, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2035831">Gaofei Xu</ext-link>, Chinese Academy of Sciences (CAS), China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2599614">Tingkai Chen</ext-link>, Dalian Maritime University, China</p></fn>
</fn-group>
</back>
</article>