<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="methods-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2026.1778827</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Methods</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>eAodeMar: a lightweight and real-time occluded marine vessel detection network for embedded marine platforms</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wang</surname><given-names>Yuanyuan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1777316/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname><given-names>Mingyu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Shi</surname><given-names>Jianqiang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ni</surname><given-names>Zuo</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname><given-names>Guobin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>International Navigation College, Hainan Tropical Ocean University</institution>, <city>Sanya</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Yazhou Bay Innovation Institute of Hainan Tropical Ocean University</institution>, <city>Sanya</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>Weihai Guangtai Airport Equipment Co., Ltd.</institution>, <city>Weihai</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Mingyu Wang, <email xlink:href="mailto:vicsee@163.com">vicsee@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-27">
<day>27</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1778827</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>25</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Wang, Wang, Shi, Ni and Li.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Wang, Wang, Shi, Ni and Li</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-27">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The detection of occluded marine vessels is critical for the safe navigation and operation of unmanned surface vehicles (USVs). While image-based detection methods have achieved substantial accuracy, their high computational and memory requirements prohibit deployment on resource-constrained embedded platforms. To address this, we propose eAodeMar (efficient AodeMar), a lightweight version built upon our prior AodeMar model, specifically designed for efficient occluded marine vessel detection.</p>
</sec>
<sec>
<title>Methods</title>
<p>The efficiency of eAodeMar is achieved by integrating Ghost convolution modules into both the backbone and the feature fusion network, significantly reducing model parameters and computational load while maintaining accuracy. To ensure practical applicability, the optimized model is deployed on an embedded GPU platform (Jetson Xavier NX), incorporating dedicated structural refinement and inference acceleration techniques.</p>
</sec>
<sec>
<title>Results</title>
<p>Extensive experiments on the public MVDD13 dataset demonstrate that eAodeMar reduces parameter count and computational load by 7.00% and 0.89%, respectively, with only a marginal accuracy drop of 0.42%, while achieving a remarkable 42.12% improvement in inference speed. When deployed on the Jetson Xavier NX device, it attains a real-time detection rate of 28.57 FPS on the SMD video stream.</p>
</sec>
<sec>
<title>Discussion</title>
<p>These comprehensive results validate that eAodeMar effectively balances high precision with high efficiency in occlusion-prone maritime environments. The model demonstrates strong potential for real-world ocean engineering applications, offering a practical solution for real-time detection on embedded systems.</p>
</sec>
</abstract>
<kwd-group>
<kwd>embedded platform</kwd>
<kwd>ghost convolution</kwd>
<kwd>light-weight network</kwd>
<kwd>marine vessel detection</kwd>
<kwd>real-time detection</kwd>
<kwd>unmanned surface vehicle</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work is supported by the Scientific Research Foundations of Hainan Tropical Ocean University under Grants RHDRCZK202526 and RHDRCZK202402.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="10"/>
<equation-count count="7"/>
<ref-count count="55"/>
<page-count count="14"/>
<word-count count="6664"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Ocean Observation</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The advancement of unmanned platforms, including unmanned surface vehicles (USVs) and aerial drones, has significantly transformed modern ocean observation and marine engineering operations (<xref ref-type="bibr" rid="B27">Liu et&#xa0;al., 2025</xref>). These platforms increasingly rely on embedded visual perception systems to achieve automated, real-time monitoring of maritime targets, particularly ships. This capability is critical for maritime traffic management (<xref ref-type="bibr" rid="B7">Gao et&#xa0;al., 2025</xref>), navigational safety, early hazard warning, environmental surveillance (<xref ref-type="bibr" rid="B38">Song et&#xa0;al., 2024</xref>), and collision avoidance (<xref ref-type="bibr" rid="B55">Zhao et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B33">Ong et&#xa0;al., 2022</xref>). However, complex marine conditions, such as fog, spray, waves and, most notably, frequent inter-vessel occlusion, which is highlighted as a primary challenge in specialized benchmarks like MVDD13 (<xref ref-type="bibr" rid="B46">Wang et&#xa0;al., 2024b</xref>), can severely degrade image quality and blur or even completely obscure target features, posing severe challenges to vision-based marine vessel detection (<xref ref-type="bibr" rid="B19">Kortylewski et&#xa0;al., 2020</xref>, <xref ref-type="bibr" rid="B18">2021</xref>). Occlusion is especially common in congested waterways and ports, substantially compromising the accuracy and reliability of existing detection models. Therefore, developing a ship detection method that maintains high accuracy under occlusion while meeting the stringent real-time and low-power constraints of embedded systems onboard unmanned platforms is crucial for advancing intelligent ocean observation and risk mitigation technologies.</p>
<p>The human visual system is capable of inferring the properties of objects from the contours present in a scene, even when local information about the objects is occluded or lost (<xref ref-type="bibr" rid="B35">Rensink and Enns, 1998</xref>). However, it remains challenging for deep learning-based computer vision systems to effectively detect occluded objects (<xref ref-type="bibr" rid="B39">Sun et&#xa0;al., 2022</xref>). To address the occlusion problem, data augmentation methods such as Mosaic (<xref ref-type="bibr" rid="B53">Zeng et&#xa0;al., 2022</xref>), Cutout (<xref ref-type="bibr" rid="B11">Hinton et&#xa0;al., 2012</xref>) and CutMix (<xref ref-type="bibr" rid="B52">Yun et&#xa0;al., 2019</xref>) have been proposed, but they alleviate occlusion only to a certain extent. Some studies have designed specialized attention modules and contextual fusion mechanisms (<xref ref-type="bibr" rid="B15">Hu et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B34">Ranftl et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2021b</xref>; <xref ref-type="bibr" rid="B12">Hou et&#xa0;al., 2021</xref>) to improve the recognition of occluded targets. Occlusion can also be regarded as a type of deformation problem: by extracting invariant features of ships, such as the gradient direction histogram feature based on the co-occurrence matrix (<xref ref-type="bibr" rid="B17">Kawahara et&#xa0;al., 2012</xref>) and constrained subspaces (<xref ref-type="bibr" rid="B29">Maki et&#xa0;al., 2002</xref>), local occlusion of ships can be addressed. However, these feature vectors are not universally applicable to all types of ships. <xref ref-type="bibr" rid="B42">Tang et&#xa0;al. (2022)</xref> combined occlusion detection strategies with matching algorithms to achieve continuous tracking after occlusion, but this approach is not suitable for detecting occluded targets in static images. Recently, <xref ref-type="bibr" rid="B45">Wang et&#xa0;al. (2024a)</xref> proposed an attention-aware occluded ship detection method, named AodeMar, to tackle the problem of feature confusion in occluded regions of ship targets. Specifically, a position enhancement module was constructed based on residual connections and coordinate attention (<xref ref-type="bibr" rid="B48">Wu et&#xa0;al., 2022</xref>). By fusing information from the horizontal and vertical spatial directions in the channel dimension, this module explicitly models interactions among positional features, thereby capturing long-range dependencies and strengthening feature representation for partially occluded targets. Additionally, a multi-scale feature semantic association method was proposed based on spatial pyramid pooling and sliding window self-attention encoders (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2021b</xref>). By sliding windows, this method establishes interactions between multi-scale target features within and across windows, enhancing the model&#x2019;s feature discrimination ability at both global and local levels. These enhancements, however, typically increase model complexity further, making real-time inference on edge computing devices difficult. Consequently, a noticeable gap exists among detection accuracy, occlusion robustness, and deployability on embedded systems. This gap is evident even in advanced models designed for complex, multi-source maritime imagery (e.g., optical, SAR), which often prioritize accuracy at the expense of computational efficiency, hindering their deployment (<xref ref-type="bibr" rid="B49">Wu et&#xa0;al., 2025</xref>).</p>
<p>Lightweight network designs offer pathways toward efficiency. Current research in this domain mainly focuses on two aspects. The first involves designing lightweight architectural modules by refining convolution operations to reduce parameter counts. For instance, MobileNet series models decompose traditional convolutions into depthwise and pointwise ones, incorporating width and resolution multipliers to compress channel dimensions and input resolution, thereby significantly lowering parameter counts (<xref ref-type="bibr" rid="B14">Howard et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B36">Sandler et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B13">Howard et&#xa0;al., 2020</xref>). However, the extensive use of depthwise convolutions can increase computational load. To mitigate this, ShuffleNet introduced group pointwise convolutions and channel shuffle operations to promote cross-group feature interaction, enhancing representational capacity while maintaining efficiency (<xref ref-type="bibr" rid="B54">Zhang et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B28">Ma et&#xa0;al., 2018</xref>). EfficientNets achieve a balanced scaling of network width, depth, and resolution to optimize the accuracy-efficiency trade-off (<xref ref-type="bibr" rid="B40">Tan and Le, 2019</xref>, <xref ref-type="bibr" rid="B41">2021</xref>). More recently, GhostNet has gained attention for generating feature maps through cost-effective linear transformations, effectively revealing intrinsic features with reduced computational cost (<xref ref-type="bibr" rid="B9">Han et&#xa0;al., 2020</xref>). This design has been successfully applied in maritime contexts, such as SAR image ship detection, to create compact models (<xref ref-type="bibr" rid="B50">Xiang et al., 2024</xref>). In maritime-specific applications, researchers have integrated depthwise separable convolution (DSC) (<xref ref-type="bibr" rid="B3">Chollet, 2017</xref>) into YOLOv4-based detectors to accelerate inference (<xref ref-type="bibr" rid="B26">Liu et&#xa0;al., 2021a</xref>), while others have combined ShuffleNetv2 backbones with SE attention modules and DSC-enhanced necks to achieve competitive accuracy on datasets such as SMD (<xref ref-type="bibr" rid="B51">Yang et&#xa0;al., 2022</xref>). A key realization in this field is that parameter count alone does not linearly correlate with practical inference speed. Consequently, recent efforts increasingly emphasize computational complexity, measured in FLOPs, as a more direct indicator of on-device performance (<xref ref-type="bibr" rid="B31">Molchanov et&#xa0;al., 2016</xref>). For example, MobileNet-SSD and its variants demonstrate that coupling lightweight backbones with efficient detection heads can yield favorable speed-accuracy profiles (<xref ref-type="bibr" rid="B14">Howard et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B36">Sandler et&#xa0;al., 2018</xref>). Nevertheless, these gains in efficiency often come at the cost of degraded feature extraction capability, particularly in challenging visual scenarios. The second research thrust adopts model compression techniques to prune redundant network structures.
To strike a balance between detection speed and accuracy, scholars have explored various compression strategies, including precision quantization, structured pruning (<xref ref-type="bibr" rid="B22">Li et&#xa0;al., 2016</xref>), and knowledge distillation (<xref ref-type="bibr" rid="B47">Wang et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B6">Gao et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B5">Deng et&#xa0;al., 2020</xref>). However, aggressive compression, as seen in YOLO-LITE (<xref ref-type="bibr" rid="B16">Huang et&#xa0;al., 2018</xref>), can lead to severe accuracy degradation. Weight-based pruning algorithms, which remove neurons with relatively small contributions, offer a more granular approach to maintaining performance while reducing model size (<xref ref-type="bibr" rid="B10">Han et&#xa0;al., 2015</xref>).</p>
<p>The pursuit of higher mAP and lower FLOPs on server GPUs does not translate directly to embedded systems. Achieving a viable triple balance among detection accuracy, processing speed, and robustness against environmental variations on constrained edge devices poses a significant and often unmet challenge for existing methods. The need for real-time perception on USV-mounted embedded vision platforms has motivated the development of lightweight detection networks that reconcile throughput with deployability. Notable instances include eWaSR, an embedded-ready maritime obstacle segmentation network that maintains robust accuracy while achieving real-time execution on embedded platforms (<xref ref-type="bibr" rid="B43">Tersek et&#xa0;al., 2023</xref>). Similarly, pruned and quantized YOLO variants have been widely adopted for ship detection, offering substantial reductions in parameters and FLOPs (<xref ref-type="bibr" rid="B1">Adarsh et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B44">Wang et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B8">Haijoub et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B23">Li and Wang, 2025</xref>). To realize the full potential of these algorithms on edge hardware, dedicated optimization tools (e.g., TensorRT) and quantization strategies have proven essential for achieving practical throughput and energy efficiency (<xref ref-type="bibr" rid="B30">Martin-Salinas et&#xa0;al., 2025</xref>). Further innovations incorporate efficient feature-fusion mechanisms and cascaded detection pipelines to accelerate inference on edge devices (<xref ref-type="bibr" rid="B25">Liu et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B37">Sankaran et&#xa0;al., 2023</xref>).</p>
<p>In summary, current approaches to the combined challenge of occluded vessel detection for embedded maritime platforms lack dedicated modeling of the occlusion patterns intrinsic to the maritime environment, cannot simultaneously satisfy the competing demands of high fidelity, low latency, and operational resilience on edge devices, and lack conclusive proof of viability in real-world settings. To address these limitations, this paper introduces eAodeMar, an algorithm-hardware co-designed framework that is rigorously validated through real-world deployments. Specifically, the key contributions are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>A lightweight and embedded-ready detection network eAodeMar is proposed for embedded maritime platforms. By systematically integrating Ghost convolution modules into both backbone and feature fusion stages, the architecture achieves a significant reduction in parameters and FLOPs while retaining the capacity to recognize partially visible ships in cluttered maritime scenes.</p></list-item>
<list-item>
<p>A complete algorithm-to-deployment co-design pipeline is established to bridge the gap between algorithmic efficiency and practical hardware execution. The pipeline includes structural lightweighting, TensorRT-based graph optimization, mixed-precision quantization, and tailored deployment on the NVIDIA Jetson Xavier NX platform, ensuring sustained real-time performance under real-world power and memory constraints.</p></list-item>
<list-item>
<p>Extensive evaluations on the public MVDD13 dataset and real-world maritime video sequences are conducted. The results demonstrate that eAodeMar effectively navigates the accuracy-efficiency trade-off, achieving a significant improvement in inference speed (<inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:mo>&gt;</mml:mo><mml:mn>40</mml:mn><mml:mo>%</mml:mo></mml:mrow></mml:math></inline-formula>) with only a minimal detection accuracy drop, thereby validating its practicality for real-time onboard perception in occlusion-prone maritime scenarios.</p></list-item>
</list>
<p>The remainder of this paper is organized as follows. Section 2 presents the proposed eAodeMar architecture and its lightweight design. Section 3 details the embedded deployment of eAodeMar. Section 4 provides the experimental setup, ablation studies, comparison results, and the corresponding analysis. Finally, Section 5 draws conclusions and outlines future work.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>The eAodeMar architecture</title>
<sec id="s2_1">
<label>2.1</label>
<title>Architecture analysis</title>
<p>The factors contributing to the model&#x2019;s complexity are first analyzed in this section to guide the subsequent lightweight design. Two key metrics for evaluating model complexity are parameter count (<xref ref-type="bibr" rid="B21">Lecun et&#xa0;al., 1998</xref>, <xref ref-type="bibr" rid="B20">2015</xref>) and FLOPs (<xref ref-type="bibr" rid="B31">Molchanov et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B14">Howard et&#xa0;al., 2017</xref>), which directly reflect the model&#x2019;s expressive capacity and its practical deployment feasibility (<xref ref-type="bibr" rid="B2">Cheng et&#xa0;al., 2017</xref>).</p>
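<p>For reference, the sketch below illustrates how these two metrics can be obtained for a PyTorch module. It is a minimal, illustrative helper rather than the exact profiling code used in our experiments: the function names are ours, and the FLOPs helper counts only the multiply-accumulate operations of a single convolution layer, consistent with the analytical expressions introduced in Section 2.2.1.</p>
<code language="python">import torch
import torch.nn as nn

def count_parameters(model: nn.Module) -> int:
    # Total number of learnable parameters (weights and biases).
    return sum(p.numel() for p in model.parameters())

def conv_flops(layer: nn.Conv2d, out_h: int, out_w: int) -> int:
    # Multiply-accumulate count of one convolution layer:
    # out_h * out_w * (c_in / groups) * c_out * k_h * k_w.
    c_in = layer.in_channels // layer.groups
    k_h, k_w = layer.kernel_size
    return out_h * out_w * c_in * layer.out_channels * k_h * k_w

# Usage sketch: a 3 x 3 convolution producing a 640 x 640 feature map.
conv = nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=False)
print(count_parameters(conv))      # 3 * 32 * 3 * 3 = 864
print(conv_flops(conv, 640, 640))  # about 0.35 G multiply-accumulates</code>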
<p>In our earlier work, we introduced a position enhancement module named RCAC3 and a multi-scale semantic association module named SP-STR, which replace the original C3 modules in the high-level layers of the backbone and neck networks, respectively. To assess their impact on model complexity, we first analyzed the changes in parameter count resulting from these replacements. As shown in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>, both modules increase the parameter count compared to their corresponding C3 counterparts. The RCAC3 module in the backbone network introduces a relatively modest increase, whereas the SP-STR module in the neck network leads to a more significant expansion, particularly in its first instance, where the parameter count roughly doubles.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Analysis of module parameter counts.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Component</th>
<th valign="middle" align="center">Module</th>
<th valign="middle" align="center">Module parameter</th>
<th valign="middle" align="center">Parameter variation</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="2" align="center">Backbone network</td>
<td valign="middle" align="center">RCAC3</td>
<td valign="middle" align="center">1189400</td>
<td valign="middle" align="center">&#x2191;0.56%</td>
</tr>
<tr>
<td valign="middle" align="center">C3</td>
<td valign="middle" align="center">1182720</td>
<td valign="middle" align="left"/>
</tr>
<tr>
<td valign="middle" rowspan="4" align="center">Neck network</td>
<td valign="middle" align="center">SP-STR</td>
<td valign="middle" align="center">725508</td>
<td valign="middle" align="center">&#x2191;100.43%</td>
</tr>
<tr>
<td valign="middle" align="center">C3</td>
<td valign="middle" align="center">361984</td>
<td valign="middle" align="left"/>
</tr>
<tr>
<td valign="middle" align="center">SP-STR</td>
<td valign="middle" align="center">1974792</td>
<td valign="middle" align="center">&#x2191;66.97%</td>
</tr>
<tr>
<td valign="middle" align="center">C3</td>
<td valign="middle" align="center">1182720</td>
<td valign="middle" align="left"/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x201c;&#x2191;&#x201d; denotes an increase.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>, a detailed analysis of the computational and performance impacts caused by replacing the two modules reveals several key findings. Compared to the original AodeMar model, replacing the RCAC3 module with a standard C3 module reduces the total parameter count by 0.81%. This modification leads to a reduction in detection accuracy, with mAP@.5 and mAP@[.5:.95] decreasing by 0.59% (0.56) and 1.90% (1.57), respectively, while the change in computational load remains negligible. Notably, however, this replacement results in a significant 33.80% improvement in inference speed (FPS). When the SP-STR module in the neck network is replaced with its C3 counterpart, the model exhibits a more pronounced reduction in parameter count down by 13.95%. The corresponding declines in detection accuracy are relatively modest, with mAP@.5 and mAP@[.5:.95] dropping by only 0.21% (0.20) and 0.17% (0.14), respectively. More strikingly, computational load is dramatically reduced to approximately one-fourth of the original, accompanied by an 11.72% increase in FPS.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The impact of RCAC3 and SP-STR on the AodeMar&#x2019;s computational load, detection accuracy, and speed.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Metrics</th>
<th valign="middle" align="center">AodeMar</th>
<th valign="middle" align="center">&#x2013;RCAC3</th>
<th valign="middle" align="center">&#x2013;SP-STR</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">mAP@.5/%</td>
<td valign="middle" align="center">95.43</td>
<td valign="middle" align="center">94.87 (&#x2193;0.56)</td>
<td valign="middle" align="center">95.23 (&#x2193;0.20)</td>
</tr>
<tr>
<td valign="middle" align="center">mAP@[.5:.95]/%</td>
<td valign="middle" align="center">82.57</td>
<td valign="middle" align="center">81.00 (&#x2193;1.57)</td>
<td valign="middle" align="center">82.43 (&#x2193;0.14)</td>
</tr>
<tr>
<td valign="middle" align="center">Total parameters</td>
<td valign="middle" align="center">8282502</td>
<td valign="middle" align="center">8275822 (&#x2193;0.81%)</td>
<td valign="middle" align="center">7126906 (&#x2193;13.95%)</td>
</tr>
<tr>
<td valign="middle" align="center">FLOPs/G</td>
<td valign="middle" align="center">67.40</td>
<td valign="middle" align="center">67.40</td>
<td valign="middle" align="center">16.30</td>
</tr>
<tr>
<td valign="middle" align="center">FPS</td>
<td valign="middle" align="center">61.73</td>
<td valign="middle" align="center">82.59</td>
<td valign="middle" align="center">68.97</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x201c;&#x2193;&#x201d; denotes a decrease, and &#x201c;<inline-formula>
<mml:math display="inline" id="im2"><mml:mo>&#x2212;</mml:mo></mml:math></inline-formula>&#x201d; denotes that the module is replaced by C3 convolution.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>These results collectively demonstrate that both RCAC3 and SP-STR modules achieve marginal improvements in detection accuracy at the cost of substantial memory consumption and reduced inference efficiency. The SP-STR module, in particular, exerts a more significant influence on overall model performance. Consequently, optimizing these two modules is essential to enable lightweight, efficient deployment without compromising practical usability.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Lightweight occluded vessel detection network eAodeMar</title>
<p>Based on the analysis in the previous section, this section focuses on the lightweight design of the RCAC3 and SP-STR modules. In convolutional neural networks, input images are progressively transformed through a series of operations such as convolution and pooling into output feature maps. Conventional convolution layers, however, often incur substantial computational overhead. Moreover, original convolution tends to generate a considerable number of redundant or highly similar feature maps during training, which further increases computational complexity and model size without proportionally improving representational capacity.</p>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>Lightweight module design based on ghost convolution</title>
<p>Ghost convolution (<xref ref-type="bibr" rid="B9">Han et&#xa0;al., 2020</xref>), introduced by Huawei Noah&#x2019;s Ark Lab, provides an efficient alternative to original convolution by substantially reducing computational complexity while maintaining comparable representational capacity. To quantitatively demonstrate the efficiency advantage of Ghost convolution over original convolution, we analyze both parameter count and FLOPs. As shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, let the input tensor be <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:mi mathvariant="script">X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>w</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>c</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, where <inline-formula>
<mml:math display="inline" id="im4"><mml:mi>h</mml:mi></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im5"><mml:mi>w</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im6"><mml:mi>c</mml:mi></mml:math></inline-formula> denote its height, width and number of input channels, respectively.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Structural diagrams contrasting the <bold>(a)</bold> Original convolution and <bold>(b)</bold> Ghost convolution, highlighting the reduction in redundant operations and parameters.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778827-g001.tif">
<alt-text content-type="machine-generated">Diagram comparing (a) an original convolutional layer and (b) a ghost convolutional layer using labeled 3D block stacks to illustrate input feature maps, filters, and output feature maps. The original layer shows a straightforward flow from input X to output Y via filters F. The ghost layer shows input X progressing through filters to generate partial outputs Y1, then applies depthwise operations to produce additional output maps, resulting in a combined output Y-tilde.</alt-text>
</graphic></fig>
<p>For original convolution, applying an operation <inline-formula>
<mml:math display="inline" id="im7"><mml:mi>F</mml:mi></mml:math></inline-formula> with <inline-formula>
<mml:math display="inline" id="im8"><mml:mi>n</mml:mi></mml:math></inline-formula> convolution kernels of size <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> can generate <inline-formula>
<mml:math display="inline" id="im10"><mml:mi>n</mml:mi></mml:math></inline-formula> feature maps. Ignoring the bias term, the output can be given by</p>
<disp-formula id="eq8">
<mml:math display="block" id="M8"><mml:mrow><mml:mi mathvariant="script">Y</mml:mi><mml:mo>=</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="script">X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:mi mathvariant="script">Y</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mo>&#xd7;</mml:mo><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mo>&#xd7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im12"><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im13"><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover></mml:math></inline-formula> denote height and width of the output feature maps, respectively. The number of parameters and FLOPs can be expressed by</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>o</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>n</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>o</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mo>&#xd7;</mml:mo><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mo>&#xd7;</mml:mo><mml:mi>c</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>n</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>o</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the number of parameters, <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>o</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> denotes FLOPs, <inline-formula>
<mml:math display="inline" id="im16"><mml:mi>c</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im17"><mml:mi>n</mml:mi></mml:math></inline-formula> represent the number of channels of input and output feature maps, respectively.</p>
<p>In Ghost convolution, only <inline-formula>
<mml:math display="inline" id="im18"><mml:mi>m</mml:mi></mml:math></inline-formula> intrinsic feature maps are generated via original convolution, where <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x226a;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:math></inline-formula>, and the remaining <inline-formula>
<mml:math display="inline" id="im20"><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:math></inline-formula> &#x201c;ghost&#x201d; feature maps are produced by applying low-cost depthwise convolutions with small kernels. Let the kernel size of the linear transformation be denoted as <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>&#xa0;</mml:mo></mml:mrow></mml:math></inline-formula>(typically <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>&#x2264;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>). The parameter count and FLOPs of Ghost convolution can be computed by</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>g</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>m</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>g</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mo>&#xd7;</mml:mo><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mi>c</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>m</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mtext>g</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mtext>g</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula> denote parameters and FLOPs of the Ghost convolution, respectively. <inline-formula>
<mml:math display="inline" id="im25"><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im26"><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover></mml:math></inline-formula> represent height and width of the output feature maps, respectively.</p>
<p>In this context, the theoretical compression ratio <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> in terms of parameters and FLOPs can be approximated as:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>&#x2248;</mml:mo><mml:mfrac><mml:mi>m</mml:mi><mml:mi>n</mml:mi></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:mo>.</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>In practice, setting <inline-formula>
<mml:math display="inline" id="im28"><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x2248;</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:math></inline-formula> and using small linear kernels (e.g., <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:math></inline-formula>) typically yields a reduction in both parameters and FLOPs by approximately half compared to original convolution, without significant loss in accuracy.</p>
<p>This intrinsic efficiency makes Ghost convolution particularly suitable for deploying detection models on resource-constrained embedded platforms, where minimizing computational overhead and memory footprint is paramount. By replacing the original convolutions in the RCAC3 and SP-STR modules with Ghost convolutions, a markedly compact architecture can be obtained.</p>
<p>Remark 1. Note that <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x2248;</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:math></inline-formula> adheres to the efficient design established in GhostNet <xref ref-type="bibr" rid="B9">Han et&#xa0;al. (2020)</xref>. Moreover, this design leads to a final model that achieves a significant reduction in parameters and FLOPs while preserving the accuracy necessary for occluded vessel detection, as will be demonstrated in the following Results and Analysis section.</p>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Network architecture of eAodeMar</title>
<p>To improve detection speed while preserving the original modules&#x2019; performance as far as possible, we employ Ghost convolution (<xref ref-type="bibr" rid="B9">Han et&#xa0;al., 2020</xref>) for a lightweight redesign, balancing efficiency with representational capacity. For clarity, the two resulting lightweight modules are denoted as G-RCAC3 and G-SP-STR, respectively, and the overall optimized and efficient model is named eAodeMar. The complete architecture is illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Schematic diagram of the proposed eAodeMar framework, which features the lightweight G-SP-STR and G-RCAC3 modules based on Ghost convolution.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778827-g002.tif">
<alt-text content-type="machine-generated">Flowchart diagram of a neural network architecture for object detection with three sections labeled Backbone, Neck, and Head. The Backbone processes a 640 by 640 by 3 image input through stacked modules including CBS, C3, G-RCAC3, and SPPF. The Neck combines modules like G-SP-STR and upsampling paths. The Head applies convolution (Conv) layers at three different scales, producing feature maps of sizes 20 by 20 by 54, 40 by 40 by 54, and 80 by 80 by 54. The output shows detected objects in an image of boats. A legend at the bottom defines module abbreviations.</alt-text>
</graphic></fig>
<p>As shown in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>, the lightweight redesign leads to a significant reduction in parameter count for both modules. Specifically, G-RCAC3 achieves a 21.65% reduction in parameters compared to the original RCAC3 module. Meanwhile, G-SP-STR, which processes the smaller-scale 512-channel feature maps in the neck network, undergoes a 13.04% parameter reduction. These reductions directly contribute to lower memory footprint and enhanced inference efficiency, making the model better suited for deployment on resource-constrained embedded platforms while preserving detection accuracy.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Module-wise comparison of parameters and computational complexity before and after lightweight optimization.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Component</th>
<th valign="middle" align="left">Module</th>
<th valign="middle" align="left">Channels of input, output</th>
<th valign="middle" align="left">FLOPs/G</th>
<th valign="middle" align="left">Variation</th>
<th valign="middle" align="left">Parameter</th>
<th valign="middle" align="left">Variation</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="2" align="left">Backbone network</td>
<td valign="middle" align="left">G-RCAC3</td>
<td valign="middle" rowspan="2" align="left">512,512</td>
<td valign="middle" align="left">67.20</td>
<td valign="middle" rowspan="2" align="left">&#x2193;0.30%</td>
<td valign="middle" align="left">0.93</td>
<td valign="middle" rowspan="2" align="left">&#x2193;21.65%</td>
</tr>
<tr>
<td valign="middle" align="left">RCAC3</td>
<td valign="middle" align="left">67.40</td>
<td valign="middle" align="left">1.19</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">Neck network</td>
<td valign="middle" align="left">G-SP-STR</td>
<td valign="middle" rowspan="2" align="left">256,256</td>
<td valign="middle" rowspan="4" align="left">66.90<break/>67.40</td>
<td valign="middle" rowspan="4" align="left">&#x2193;0.74%</td>
<td valign="middle" align="left">0.66</td>
<td valign="middle" rowspan="2" align="left">&#x2193;8.72%</td>
</tr>
<tr>
<td valign="middle" align="left">SP-STR</td>
<td valign="middle" align="left">0.73</td>
</tr>
<tr>
<td valign="middle" align="left">G-SP-STR</td>
<td valign="middle" rowspan="2" align="left">512,512</td>
<td valign="middle" align="left">1.72</td>
<td valign="middle" rowspan="2" align="left">&#x2193;13.04%</td>
</tr>
<tr>
<td valign="middle" align="left">SP-STR</td>
<td valign="middle" align="left">1.97</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From <xref ref-type="disp-formula" rid="eq1">Equations 1</xref> and <xref ref-type="disp-formula" rid="eq3">3</xref>, it can be derived that the reduction in model parameters is intrinsically linked to both the kernel size and the channel dimensions. When the number of input channels is substantial, this parameter-saving mechanism becomes particularly pronounced, highlighting the efficiency advantage of the adopted lightweight design.</p>
<p>As shown in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>, the variation in computational load reflects the impact of replacing individual modules within the overall network. It is noteworthy that, following the substitution with lightweight counterparts, the backbone network exhibits a relatively minor increase in computational overhead. This observation aligns with the mathematical relationships expressed in <xref ref-type="disp-formula" rid="eq2">Equations 2</xref> and <xref ref-type="disp-formula" rid="eq4">4</xref>, which indicate that reductions in model FLOPs are also strongly influenced by kernel dimensions and channel configurations.</p>
<p>Remark 2. Note that <xref ref-type="disp-formula" rid="eq5">Equation 5</xref> considers only convolutional operations and does not incorporate computational contributions from bias terms, activation functions such as SiLU, or other layer-wise operations. Therefore, the theoretical compression ratio presented here denotes idealized upper limits. In practical deployment scenarios, the actual achievable reductions tend to be less pronounced.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Embedded deployment pipeline of lightweight eAodeMar</title>
<sec id="s3_1">
<label>3.1</label>
<title>Deployment platform</title>
<p>As shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, the deployment is conducted on the NVIDIA Jetson Xavier NX embedded GPU module, a compact platform (70 mm &#xd7; 45 mm) supporting configurable power modes up to 20 W and delivering peak performance of 21 TOPS. Key specifications are listed in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>. The module integrates a 6-core NVIDIA Carmel ARM CPU and a 384-core Volta GPU with 48 Tensor cores, paired with 8 GB of shared memory. It also includes dedicated accelerators for deep learning and vision tasks, along with programmable vision engines. Connectivity features comprise PCIE 3.0/4.0 interfaces for storage, a 16-lane MIPI CSI-2 camera interface, DP/HDMI outputs, and low-speed interfaces (IIC, SPI, UART and IIS) for peripheral integration, making it well-suited for real-time maritime vision applications.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Experimental devices and Jetson Xavier NX GPU.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778827-g003.tif">
<alt-text content-type="machine-generated">Workspace with a computer monitor displaying code and an image analysis program, a keyboard, and a small computer device outlined in red; a red dashed arrow points to a close-up view of the device&#x2019;s circuit board with a cooling fan on the right side.</alt-text>
</graphic></fig>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Hardware parameters of the Jetson Xavier NX.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Parameter</th>
<th valign="middle" align="center">Detailed description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">AI Performance</td>
<td valign="middle" align="center">21TOPS (INT8) or 6.8TFLOPS (FP16)</td>
</tr>
<tr>
<td valign="middle" align="center">GPU</td>
<td valign="middle" align="center">384 CUDA cores and 48 Tensor Cores</td>
</tr>
<tr>
<td valign="middle" align="center">CPU</td>
<td valign="middle" align="center">6-core NVIDIA Carmel ARM<sup>&#xae;</sup>v8.2 64-bit CPU</td>
</tr>
<tr>
<td valign="middle" align="center">Memory</td>
<td valign="middle" align="center">8 GB 128-bit LPDDR4x</td>
</tr>
<tr>
<td valign="middle" align="center">Storage</td>
<td valign="middle" align="center">16 GB eMMC 5.1 flash memory</td>
</tr>
<tr>
<td valign="middle" align="center">Power</td>
<td valign="middle" align="center">10W/15W/20W</td>
</tr>
<tr>
<td valign="middle" align="center">Linux</td>
<td valign="middle" align="center">Ubuntu 18.04</td>
</tr>
<tr>
<td valign="middle" align="center">TensorRT</td>
<td valign="middle" align="center">TensorRT 7.1.3</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Deployment process</title>
<p>The deployment of the lightweight eAodeMar model onto the Jetson Xavier NX embedded GPU encompasses a structured optimization and acceleration pipeline designed to ensure efficient real-time inference. The overall workflow, executed within an Ubuntu 18.04 environment updated with JetPack 4.6, involves environment configuration, model acceleration using TensorRT, and the final on-device inference implementation.</p>
<p>(a) TensorRT-based Model Acceleration.</p>
<p>Acceleration is primarily achieved through parameter quantization and graph-level structural optimization. First, network parameters are quantized from FP32 to FP16 precision, reducing memory bandwidth and computational demand while maintaining acceptable accuracy. Subsequently, TensorRT performs a comprehensive graph-level refactoring that merges compatible layers both vertically and horizontally to minimize the complexity of the execution graph.</p>
<p>As illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>, each layer in a typical inference graph invokes a separate CUDA kernel. Vertical fusion combines operations such as convolution, bias addition, and SiLU activation into a single C-B-S (Convolution-Bias-SiLU) module, thereby reducing the number of kernel launches from three to one. Horizontal fusion is applied to C-B-S layers that share the same input tensor and perform identical operations, further streamlining the computational graph. These optimizations collectively decrease latency and improve throughput, enabling the model to meet real-time performance requirements on the embedded platform.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>TensorRT interlayer integration strategy: <bold>(a)</bold> Original network. <bold>(b)</bold> Vertical fusion strategy. <bold>(c)</bold> Combined vertical-horizontal fusion strategy.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778827-g004.tif">
<alt-text content-type="machine-generated">Three-panel diagram illustrating the evolution of a neural network module. Panel (a) shows a structure with separate convolution, bias, and SiLU activation layers for each kernel size, followed by concatenation and output. Panel (b) simplifies the module by merging convolution, batch normalization, and SiLU into single blocks labeled C-B-S, and panel (c) highlights one of these blocks in blue, maintaining the maxpool and concatenation structure throughout.</alt-text>
</graphic></fig>
<p>(b) Implementation of the TensorRT Inference Framework.</p>
<p>The inference acceleration framework is realized through a sequential three-stage workflow: model conversion, engine construction, and runtime execution. First, the PyTorch-trained model (.pt) is exported to the Open Neural Network Exchange (ONNX) format (.onnx) to ensure framework interoperability. The ONNX file is then transferred to the Jetson Xavier NX, where a TensorRT builder parses the model graph. Within the CUDA environment, the builder applies kernel auto-selection, layer fusion, and FP16 quantization to generate a highly optimized execution plan. This plan is serialized into a portable TensorRT engine file (.engine). During inference, the engine is deserialized to restore the complete network definition, trained parameters, and pre-allocated activation buffers. Forward passes are executed with minimal overhead, producing output tensors that encode detected ship proposals. Finally, these outputs are decoded and processed through Non-Maximum Suppression (NMS) to eliminate redundant bounding boxes, completing the real-time ship detection for maritime video streams.</p>
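<p>A condensed sketch of this three-stage flow is given below, reusing the hypothetical build_fp16_engine() helper sketched earlier; file names, tensor names, and the ONNX opset are illustrative, and the checkpoint is assumed to store a complete PyTorch module.</p>
<preformat>
# Condensed sketch of the three-stage deployment flow; file names, tensor
# names, and the ONNX opset are illustrative. Assumes the .pt checkpoint
# stores a complete PyTorch module.
import torch
import tensorrt as trt

# Stage 1 (training workstation): export the trained model to ONNX.
model = torch.load("eaodemar.pt", map_location="cpu").eval()
dummy = torch.zeros(1, 3, 640, 640)          # matches the 640 x 640 input resolution
torch.onnx.export(model, dummy, "eaodemar.onnx", opset_version=12,
                  input_names=["images"], output_names=["preds"])

# Stage 2 (Jetson): build and serialize the engine, e.g. with the
# hypothetical build_fp16_engine() helper sketched earlier.
# with open("eaodemar.engine", "wb") as f:
#     f.write(build_fp16_engine("eaodemar.onnx").serialize())

# Stage 3 (Jetson): deserialize the engine and run inference.
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
with open("eaodemar.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()  # forward passes via context.execute_v2(...)
</preformat>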
</sec>
</sec>
<sec id="s4" sec-type="results">
<label>4</label>
<title>Results and analysis</title>
<sec id="s4_1">
<label>4.1</label>
<title>Implementation details and datasets</title>
<p>The experimental configuration is detailed in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>. To ensure a fair comparison, all models are trained, validated, and tested using an input resolution of <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:mn>640</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>640</mml:mn></mml:mrow></mml:math></inline-formula> pixels. During training, a batch size of 16 was employed, and models were optimized for 300 epochs until convergence. Inference speed is reported in FPS, which accounts for the complete processing pipeline, including image pre-processing, forward pass, and NMS, rather than the forward pass alone.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>The training environment configuration.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Parameter</th>
<th valign="middle" align="center">Configuration</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Linux</td>
<td valign="middle" align="center">Ubuntu 18.04</td>
</tr>
<tr>
<td valign="middle" align="center">CPU</td>
<td valign="middle" align="center">Intel Core i7-8700K</td>
</tr>
<tr>
<td valign="middle" align="center">GPU</td>
<td valign="middle" align="center">GeForce RTX 2080Ti</td>
</tr>
<tr>
<td valign="middle" align="center">CUDA</td>
<td valign="middle" align="center">CUDA 10.2</td>
</tr>
<tr>
<td valign="middle" align="center">cuDNN</td>
<td valign="middle" align="center">cuDNN v8.3.2</td>
</tr>
<tr>
<td valign="middle" align="center">Algorithm framework</td>
<td valign="middle" align="center">PyTorch 1.7.1</td>
</tr>
<tr>
<td valign="middle" align="center">Optimizer</td>
<td valign="middle" align="center">SGD</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>All experiments are conducted on a publicly available maritime vessel detection dataset, MVDD13<xref ref-type="fn" rid="fn1"><sup>1</sup></xref> (<xref ref-type="bibr" rid="B46">Wang et&#xa0;al., 2024b</xref>). This dataset comprises 13 ship categories with precise bounding-box annotations. For all models, 25,541, 2,838 and 7,095 images are used for training, validation and testing, respectively. To further assess the practical applicability of the deployed model in real-world maritime scenarios, we additionally employ the most challenging on-board video sequences of the SMD<xref ref-type="fn" rid="fn2"><sup>2</sup></xref> (<xref ref-type="bibr" rid="B32">Moosbauer et&#xa0;al., 2019</xref>) dataset and the video data captured by our USV operating at the Linghai campus wharf in Dalian. Sample frames are shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Example images from <bold>(a)</bold> SMD onboard video and <bold>(b)</bold> Linghai campus wharf video used for evaluation, featuring challenging scenarios with varied obstacles.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778827-g005.tif">
<alt-text content-type="machine-generated">Panel (a) shows four seascape photographs with cargo ships on the distant horizon and calm water in the foreground under a lightly clouded sky. Panel (b) displays four riverside urban photos: the first with docked sailboats and industrial structures, the second and fourth depicting waterfront buildings and trees, and the third featuring three tall modern buildings with a sunset backdrop.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Evaluation metrics</title>
<p>Given an IoU threshold, the Precision and Recall can be determined accordingly. The AP for each category can then be calculated by</p>
<disp-formula id="eq7">
<mml:math display="block" id="M7"><mml:mrow><mml:mtext>AP</mml:mtext><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x222b;</mml:mo><mml:mn>0</mml:mn><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mstyle><mml:mtext>Pr</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mtext>Re</mml:mtext><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mtext>Re</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>where Pr and Re denote the Precision and Recall, respectively.</p>
<p>Accordingly, the mean average precision (mAP) over all categories at specific IoU thresholds, i.e., mAP@.5 and mAP@[.5:.95], can be obtained. In addition, the FPS<xref ref-type="fn" rid="fn3"><sup>3</sup></xref> can be calculated directly by dividing the number of processed frames by the total time cost.</p>
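<p>For clarity, a minimal numerical sketch of this computation is given below: AP is the area under the monotonized precision-recall curve, and mAP averages AP over all categories. The function names are illustrative, not from the authors&#x2019; code.</p>
<preformat>
# Illustrative numerical sketch of AP and mAP; function names are ours, not
# from the authors' code. AP is the area under the monotonized
# precision-recall curve, integrated over recall in [0, 1].
import numpy as np

def average_precision(recall, precision):
    r = np.concatenate(([0.0], recall, [1.0]))
    p = np.concatenate(([0.0], precision, [0.0]))
    # enforce a non-increasing precision envelope before integrating
    for i in range(len(p) - 2, -1, -1):
        p[i] = max(p[i], p[i + 1])
    return float(np.trapz(p, r))    # approximates the integral of Pr(Re) d Re

def mean_average_precision(ap_per_class):
    return sum(ap_per_class) / len(ap_per_class)
</preformat>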
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Ablation studies</title>
<sec id="s4_3_1">
<label>4.3.1</label>
<title>Influence of the two lightweight modules on model performance</title>
<p>The results are shown in <xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref>. Compared with the original AodeMar model, the variant employing only the lightweight G-RCAC3 module exhibits a reduction in mAP@.5 of 0.31%, accompanied by moderate decreases in both parameter count and FLOPs, while achieving a 32.40% improvement in FPS. The model using only the G-SP-STR module shows a slightly larger accuracy drop of 0.62% in mAP@.5, together with reduced model complexity, and attains a 35.24% gain in FPS.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Performance comparisons between AodeMar and its lightweight models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Metrics</th>
<th valign="middle" align="center">AodeMar</th>
<th valign="middle" align="center">+G-RCAC3</th>
<th valign="middle" align="center">+G-SP-STR</th>
<th valign="middle" align="center">eAodeMar</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">mAP@.5/%</td>
<td valign="middle" align="center">95.43</td>
<td valign="middle" align="center">95.13</td>
<td valign="middle" align="center">94.84</td>
<td valign="middle" align="center">95.03</td>
</tr>
<tr>
<td valign="middle" align="center">Total parameters/M</td>
<td valign="middle" align="center">8.28</td>
<td valign="middle" align="center">8.02</td>
<td valign="middle" align="center">7.96</td>
<td valign="middle" align="center">7.70</td>
</tr>
<tr>
<td valign="middle" align="center">FLOPs/G</td>
<td valign="middle" align="center">67.40</td>
<td valign="middle" align="center">67.20</td>
<td valign="middle" align="center">66.90</td>
<td valign="middle" align="center">66.80</td>
</tr>
<tr>
<td valign="middle" align="center">FPS</td>
<td valign="middle" align="center">61.73</td>
<td valign="middle" align="center">81.30</td>
<td valign="middle" align="center">83.33</td>
<td valign="middle" align="center">87.72</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x201c;+&#x201d; indicates that the C3 convolution is replaced by the corresponding module.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>When both lightweight modules are integrated simultaneously, the resulting eAodeMar model achieves the most favorable balance across all evaluated metrics. Specifically, it maintains a nearly identical detection accuracy, with only a 0.42% decrease in mAP@.5, while substantially reducing computational complexity. Most notably, the inference speed is increased by nearly 26 FPS, corresponding to a 42.12% improvement relative to the baseline. These results validate the effectiveness of the proposed Ghost convolution-based lightweight design in enhancing real-time performance with minimal compromise in detection accuracy.</p>
</sec>
<sec id="s4_3_2">
<label>4.3.2</label>
<title>Influence of lightweight backbones on model performance</title>
<p>To systematically evaluate the impact of lightweight backbone networks, we conducted an ablation study comparing several well-established lightweight architectures: GhostNet (<xref ref-type="bibr" rid="B9">Han et&#xa0;al., 2020</xref>), ShuffleNetv2 (<xref ref-type="bibr" rid="B28">Ma et&#xa0;al., 2018</xref>), MobileNetv3 (<xref ref-type="bibr" rid="B13">Howard et&#xa0;al., 2020</xref>), PP-LCNet (<xref ref-type="bibr" rid="B4">Cui et&#xa0;al., 2021</xref>), and EfficientNetv2 (<xref ref-type="bibr" rid="B41">Tan and Le, 2021</xref>).</p>
<p>The comparative results are presented in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>. Among all lightweight backbone variants, eAodeMar achieves the second-highest inference speed, surpassed only by A-PP-LCNet. However, eAodeMar holds no advantage in parameter count or FLOPs: it carries more parameters than any lightweight-backbone variant (7.70 M, versus 7.46 M for the closest, A-GhostNetv2) and approximately 10% higher FLOPs. Notably, eAodeMar keeps a clear lead in detection accuracy over all lightweight backbone alternatives. Within the lightweight backbone group, A-PP-LCNet exhibits the lowest parameter count and FLOPs, resulting in the fastest inference speed, while A-EfficientNet achieves the highest detection accuracy in both mAP@.5 and mAP@[.5:.95], closely followed by A-GhostNetv2. In terms of FPS within this group, A-PP-LCNet ranks first and A-EfficientNet second.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparisons among eAodeMar, AodeMar and models with lightweight backbones.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Parameters/M</th>
<th valign="middle" align="center">FLOPs/G</th>
<th valign="middle" align="center">mAP@.5/%</th>
<th valign="middle" align="center">mAP@[.5:.95]/%</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">A-GhostNetv2</td>
<td valign="middle" align="center">7.46</td>
<td valign="middle" align="center">59.4</td>
<td valign="middle" align="center">92.57</td>
<td valign="middle" align="center">81.73</td>
<td valign="middle" align="center">69.93</td>
</tr>
<tr>
<td valign="middle" align="center">A-ShuffleNetv2</td>
<td valign="middle" align="center">5.81</td>
<td valign="middle" align="center">57.6</td>
<td valign="middle" align="center">90.77</td>
<td valign="middle" align="center">78.07</td>
<td valign="middle" align="center">75.19</td>
</tr>
<tr>
<td valign="middle" align="center">A-MobileNetv3</td>
<td valign="middle" align="center">6.15</td>
<td valign="middle" align="center">57.9</td>
<td valign="middle" align="center">91.79</td>
<td valign="middle" align="center">79.87</td>
<td valign="middle" align="center">51.68</td>
</tr>
<tr>
<td valign="middle" align="center">A-PP-LCNet</td>
<td valign="middle" align="center">5.63</td>
<td valign="middle" align="center">57.6</td>
<td valign="middle" align="center">91.64</td>
<td valign="middle" align="center">79.44</td>
<td valign="middle" align="center">88.89</td>
</tr>
<tr>
<td valign="middle" align="center">A-EfficientNet</td>
<td valign="middle" align="center">6.18</td>
<td valign="middle" align="center">59.0</td>
<td valign="middle" align="center">92.60</td>
<td valign="middle" align="center">81.77</td>
<td valign="middle" align="center">78.43</td>
</tr>
<tr>
<td valign="middle" align="center">AodeMar</td>
<td valign="middle" align="center">8.28</td>
<td valign="middle" align="center">67.4</td>
<td valign="middle" align="center">95.43</td>
<td valign="middle" align="center">82.57</td>
<td valign="middle" align="center">61.73</td>
</tr>
<tr>
<td valign="middle" align="center">eAodeMar</td>
<td valign="middle" align="center">7.70</td>
<td valign="middle" align="center">66.8</td>
<td valign="middle" align="center">95.03</td>
<td valign="middle" align="center">84.53</td>
<td valign="middle" align="center">87.72</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x201c;A-X&#x201d; denotes AodeMar with its backbone replaced by X.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Compared to the original AodeMar model, all lightweight backbones substantially reduce parameter count and computational complexity, and all except A-MobileNetv3 significantly improve FPS. For example, A-PP-LCNet increases FPS by 44.00%, while A-EfficientNet achieves a 27.05% improvement. However, replacing the backbone with any lightweight architecture consistently leads to a considerable drop in detection accuracy. The smallest accuracy degradation is observed with A-EfficientNet, which exhibits reductions of 2.97% in mAP@.5 and 0.97% in mAP@[.5:.95].</p>
</sec>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Comparison experimental results</title>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Comparison with state-of-the-art lightweight models</title>
<p>To validate the superiority of the proposed eAodeMar model, we conduct a comparative evaluation against several mainstream lightweight object detectors, including YOLOv3-Tiny (<xref ref-type="bibr" rid="B1">Adarsh et&#xa0;al., 2020</xref>), YOLOv4-Tiny (<xref ref-type="bibr" rid="B44">Wang et&#xa0;al., 2021</xref>), YOLOv5-Lite, and YOLOv7-Tiny.</p>
<p>The comparison results are shown in <xref ref-type="table" rid="T8"><bold>Table&#xa0;8</bold></xref>. It can be observed that existing lightweight models generally exhibit lower parameter counts and FLOPs. In particular, YOLOv5-Lite uses only about 20% of the parameters and 5% of the FLOPs of AodeMar/eAodeMar. However, its inference speed exceeds that of eAodeMar by only 12.28 FPS (a 14% improvement), while suffering a significant degradation in detection accuracy: mAP@.5 drops by 7.69 points (8.1%) and mAP@[.5:.95] by 14.30 points (16.9%).</p>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Quantitative comparisons with typical lightweight models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Parameters/M</th>
<th valign="middle" align="center">FLOPs/G</th>
<th valign="middle" align="center">mAP@.5/%</th>
<th valign="middle" align="center">mAP@[.5:.95]/%</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">YOLOv3-Tiny</td>
<td valign="middle" align="center">4.91</td>
<td valign="middle" align="center">5.61</td>
<td valign="middle" align="center">78.44</td>
<td valign="middle" align="center">43.52</td>
<td valign="middle" align="center">200.04</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv4-Tiny</td>
<td valign="middle" align="center">5.90</td>
<td valign="middle" align="center">16.22</td>
<td valign="middle" align="center">83.01</td>
<td valign="middle" align="center">52.36</td>
<td valign="middle" align="center">179.15</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5-Lite</td>
<td valign="middle" align="center">1.56</td>
<td valign="middle" align="center">3.80</td>
<td valign="middle" align="center">87.34</td>
<td valign="middle" align="center">70.23</td>
<td valign="middle" align="center">100.00</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv7-Tiny</td>
<td valign="middle" align="center">6.23</td>
<td valign="middle" align="center">13.86</td>
<td valign="middle" align="center">90.56</td>
<td valign="middle" align="center">78.46</td>
<td valign="middle" align="center">83.86</td>
</tr>
<tr>
<td valign="middle" align="center">AodeMar</td>
<td valign="middle" align="center">8.28</td>
<td valign="middle" align="center">67.40</td>
<td valign="middle" align="center">95.43</td>
<td valign="middle" align="center">82.57</td>
<td valign="middle" align="center">61.73</td>
</tr>
<tr>
<td valign="middle" align="center">eAodeMar</td>
<td valign="middle" align="center">7.70</td>
<td valign="middle" align="center">66.8</td>
<td valign="middle" align="center">95.03</td>
<td valign="middle" align="center">84.53</td>
<td valign="middle" align="center">87.72</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Furthermore, the results reveal that parameter count and FLOPs are not strictly proportional to actual inference speed. Although eAodeMar has relatively higher parameter and FLOP values, it still achieves faster inference than the widely adopted YOLOv7-Tiny model. These comparisons demonstrate that eAodeMar maintains a clear advantage in detection accuracy while exhibiting competitive inference efficiency, with notable potential for further speed optimization.</p>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Performance comparison after TensorRT acceleration</title>
<p>For convenience, the TensorRT-accelerated model is denoted as tAodeMar. In this study, both the baseline AodeMar and the lightweight eAodeMar are initially deployed on the Jetson Xavier NX platform using the PyTorch framework for reference performance evaluation.</p>
<p>As shown in <xref ref-type="table" rid="T9"><bold>Table&#xa0;9</bold></xref>, we compare the stage-wise detection accuracy, inference speed, and time consumption of the AodeMar, eAodeMar and tAodeMar models on the MVDD13 image test set. Compared to the original AodeMar running on the embedded platform, eAodeMar reduces the time of all three phases (pre-processing, inference, NMS), yielding an overall speed improvement of 4.5%. Following TensorRT acceleration, the optimized tAodeMar model achieves a 61.56% reduction in inference time, more than doubling the overall detection speed to 37.45 FPS. Although a marginal decrease of 0.23% in mAP@.5 is observed, this minimal accuracy loss strongly validates the practical efficacy of the tAodeMar model, confirming that significant acceleration can be attained with negligible degradation in detection performance.</p>
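<p>For reference, these frame rates are consistent with the per-frame latency budget in <xref ref-type="table" rid="T9"><bold>Table&#xa0;9</bold></xref>: for tAodeMar, for instance, 1000/(1.4 + 22.3 + 3.0) &#x2248; 37.45 FPS.</p>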
<table-wrap id="T9" position="float">
<label>Table&#xa0;9</label>
<caption>
<p>Performance comparison of models before and after deployment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">mAP@.5/%</th>
<th valign="middle" align="center">Pre-processing/ms</th>
<th valign="middle" align="center">Inference/ms</th>
<th valign="middle" align="center">NMS/ms</th>
<th valign="middle" align="center">FPS (Jetson)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">AodeMar (PyTorch)</td>
<td valign="middle" align="center">95.40</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">60.2</td>
<td valign="middle" align="center">3.4</td>
<td valign="middle" align="center">15.31</td>
</tr>
<tr>
<td valign="middle" align="center">eAodeMar (PyTorch)</td>
<td valign="middle" align="center">95.09</td>
<td valign="middle" align="center">1.5</td>
<td valign="middle" align="center">58.0</td>
<td valign="middle" align="center">3.0</td>
<td valign="middle" align="center">16.00</td>
</tr>
<tr>
<td valign="middle" align="center">tAodeMar (TensorRT-FP16)</td>
<td valign="middle" align="center">94.87</td>
<td valign="middle" align="center">1.4</td>
<td valign="middle" align="center">22.3</td>
<td valign="middle" align="center">3.0</td>
<td valign="middle" align="center">37.45</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To thoroughly evaluate the real-time performance of these models under practical deployment conditions, we conducted inference tests on two distinct maritime video sequences: (1) a real-world video captured by our USV operating at the Linghai campus wharf in Dalian, and (2) the publicly available SMD onboard ship video.</p>
<p>As summarized in <xref ref-type="table" rid="T10"><bold>Table&#xa0;10</bold></xref>, the AodeMar, eAodeMar, and tAodeMar models achieved inference speeds of 3.60, 5.76 and 18.31 FPS, respectively, on the real-world wharf video, and 6.93, 7.33 and 28.57 FPS on the SMD video. These results demonstrate a marked improvement in detection speed following TensorRT acceleration. While the tAodeMar model attains 28.57 FPS on the SMD video, clearly satisfying the common real-time threshold of <inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:mo>&#x2265;</mml:mo><mml:mn>20</mml:mn></mml:mrow></mml:math></inline-formula> FPS, its performance on the more complex real-world video (18.31 FPS) falls slightly below this threshold. As illustrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, the discrepancy is largely attributable to differences in scene complexity: the wharf environment contains richer background clutter and finer details than the open-sea scenes in the SMD video. Such complexity tends to generate a higher number of candidate proposals during detection, thereby increasing computational overhead and reducing the frame rate. Overall, the TensorRT-accelerated model (tAodeMar) achieves a substantial boost in inference speed while confining the loss in detection accuracy to an acceptable margin. These outcomes validate both the effectiveness of the proposed lightweight-acceleration pipeline and its practical applicability in real-world maritime perception scenarios.</p>
<table-wrap id="T10" position="float">
<label>Table&#xa0;10</label>
<caption>
<p>Comparison of detection speed before and after acceleration on the actual collected and SMD video streams.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Video data</th>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Pre-processing/ms</th>
<th valign="middle" align="center">Inference/ms</th>
<th valign="middle" align="center">NMS/ms</th>
<th valign="middle" align="center">FPS (Jetson)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="3" align="center">Actual collected video</td>
<td valign="middle" align="center">AodeMar (PyTorch)</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">60.2</td>
<td valign="middle" align="center">3.4</td>
<td valign="middle" align="center">3.60</td>
</tr>
<tr>
<td valign="middle" align="center">eAodeMar (PyTorch)</td>
<td valign="middle" align="center">1.5</td>
<td valign="middle" align="center">58.0</td>
<td valign="middle" align="center">3.0</td>
<td valign="middle" align="center">5.76</td>
</tr>
<tr>
<td valign="middle" align="center">tAodeMar (TensorRT-FP16)</td>
<td valign="middle" align="center">1.4</td>
<td valign="middle" align="center">22.3</td>
<td valign="middle" align="center">3.0</td>
<td valign="middle" align="center">18.31</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">SMD</td>
<td valign="middle" align="center">AodeMar (PyTorch)</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">60.2</td>
<td valign="middle" align="center">3.4</td>
<td valign="middle" align="center">6.93</td>
</tr>
<tr>
<td valign="middle" align="center">eAodeMar (PyTorch)</td>
<td valign="middle" align="center">1.5</td>
<td valign="middle" align="center">58.0</td>
<td valign="middle" align="center">3.0</td>
<td valign="middle" align="center">7.33</td>
</tr>
<tr>
<td valign="middle" align="center">tAodeMar (TensorRT-FP16)</td>
<td valign="middle" align="center">1.4</td>
<td valign="middle" align="center">22.3</td>
<td valign="middle" align="center">3.0</td>
<td valign="middle" align="center">28.57</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To qualitatively compare the visual detection results before and after acceleration, four representative frames are selected from the real-world video, covering both glare (row 1) and backlighting (rows 2-4) illumination scenarios, as well as challenging scenarios involving occlusion and partially visible objects. The detection results are shown in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>. In the glare-affected occlusion scene (row 1), both models successfully detect two heavily occluded vessels, indicating that the lightweight design retains the ability to recognize targets under severe occlusion. However, owing to the adverse illumination in this scene, a false positive occurs: a building with windows extending over the water on the left pier is incorrectly classified as a passenger ship. This is attributed to the reduced model complexity of the lightweight design, which slightly weakens feature extraction capability, leading to occasional false and missed detections. For the two clearly visible targets in row 2 and the partially visible one on the right in row 3, both models achieve correct identification and localization with high confidence. In contrast, for the partially visible sailboats on the left in rows 3 and 4, despite their relatively large area within the frame, both models yield low confidence scores, and the eAodeMar model even misses the target entirely. This observation indicates that detection performance is not directly proportional to an object&#x2019;s image size. Instead, the detection outcome depends more strongly on the completeness and distinctiveness of salient structural features (e.g., sails or masts), a behavior that aligns well with human visual recognition mechanisms.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Comparison of detection results on challenging scenes from the Linghai campus wharf. <bold>(a)</bold> Four selected original video frames, featuring both glare (row 1) and backlighting (rows 2-4) scenarios with significant occlusion and partial targets. <bold>(b)</bold> and <bold>(c)</bold> are the corresponding detection results from the AodeMar and our proposed eAodeMar, respectively.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778827-g006.tif">
<alt-text content-type="machine-generated">Nine-panel comparison of object detection on waterfront scenes, showing different stages as a boat approaches a dock. Columns labeled (a), (b), and (c) each display four sequential frames; (b) and (c) include overlays with green or red boxes and confidence scores for detected objects like &#x201c;sailingboat&#x201d; and &#x201c;passenger.&#x201d;.</alt-text>
</graphic></fig>
<p>Overall, the eAodeMar model demonstrates the capability to accurately localize and rapidly identify vessels in complex maritime environments. While a modest trade-off in detection accuracy is observed, the model achieves a substantial enhancement in inference speed. For real-world applications that require a balanced compromise between detection performance and processing efficiency, eAodeMar presents a highly viable and practical solution.</p>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion and future work</title>
<p>To enable the practical deployment of an occluded ship detection model on embedded GPUs, this paper proposes eAodeMar, a lightweight ship detection model constructed using Ghost convolution-based modules. Compared to the original AodeMar model, eAodeMar achieves a significant improvement in detection speed with only a marginal decrease in detection accuracy. Furthermore, the eAodeMar model was deployed on the Jetson Xavier NX embedded GPU, where TensorRT was employed for model optimization and acceleration. Performance was evaluated on the MVDD13 test set, video data collected at the Linghai campus wharf in Dalian, and the SMD video dataset. The results show that the TensorRT-optimized model substantially reduces inference latency, leading to a remarkable increase in detection speed. Specifically, the optimized model achieves 37.45 FPS on the test set and 28.57 FPS on the SMD video, representing a significant improvement over both AodeMar and the non-accelerated eAodeMar. This meets the initial deployment objective of enhancing detection speed while considerably lowering the hardware requirements of the model. Future work will focus on (1) advanced inference strategies such as cascaded detection and adaptive mechanisms for complex scenarios; (2) lightweight attention and contextual modeling to better recognize partially visible targets; and (3) enhanced robustness under diverse, extreme maritime conditions, building upon this efficient baseline for next-generation autonomous marine vision systems.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author/s.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>YW: Data curation, Conceptualization, Validation, Methodology, Writing &#x2013; review &amp; editing, Investigation, Formal analysis, Writing &#x2013; original draft, Supervision, Funding acquisition, Software. MW: Supervision, Writing &#x2013; review &amp; editing. JS: Writing &#x2013; review &amp; editing, Investigation, Funding acquisition. ZN: Conceptualization, Writing &#x2013; review &amp; editing, Investigation, Software. GL: Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Data curation.</p></sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>Author JS was employed by the company Weihai Guangtai Airport Equipment Co., Ltd.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Adarsh</surname> <given-names>P.</given-names></name>
<name><surname>Rathi</surname> <given-names>P.</given-names></name>
<name><surname>Kumar</surname> <given-names>M.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>YOLOv3-Tiny: Object detection and recognition using one stage improved model</article-title>,&#x201d; in <conf-name>2020 6th International Conference on Advanced Computing and Communication Systems</conf-name>, (<publisher-loc>Coimbatore, India</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>687</fpage>&#x2013;<lpage>694</lpage>.
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Cheng</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Zhou</surname> <given-names>P.</given-names></name>
<name><surname>Zhang</surname> <given-names>T</given-names></name>
</person-group>. (<year>2018</year>). 
<article-title>Model compression and acceleration for deep neural networks: The principles, progress, and challenges</article-title>. <source>IEEE Signal Process. Mag.</source> <volume>35</volume>, <fpage>126</fpage>&#x2013;<lpage>136</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/MSP.2017.2765695</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Chollet</surname> <given-names>F.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Xception: Deep learning with depthwise separable convolutions</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Honolulu, HI, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>1800</fpage>&#x2013;<lpage>1807</lpage>.
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Cui</surname> <given-names>C.</given-names></name>
<name><surname>Gao</surname> <given-names>T.</given-names></name>
<name><surname>Wei</surname> <given-names>S.</given-names></name>
<name><surname>Du</surname> <given-names>Y.</given-names></name>
<name><surname>Guo</surname> <given-names>R.</given-names></name>
<name><surname>Dong</surname> <given-names>S.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>PP-LCNet: A lightweight CPU convolutional neural network</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision Workshops</conf-name>, (<publisher-loc>Montreal, QC, Canada</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>356</fpage>&#x2013;<lpage>365</lpage>.
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Deng</surname> <given-names>L.</given-names></name>
<name><surname>Li</surname> <given-names>G.</given-names></name>
<name><surname>Han</surname> <given-names>S.</given-names></name>
<name><surname>Shi</surname> <given-names>L.</given-names></name>
<name><surname>Xie</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Model compression and hardware acceleration for neural networks: A comprehensive survey</article-title>. <source>Proc. IEEE</source> <volume>108</volume>, <fpage>485</fpage>&#x2013;<lpage>532</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JPROC.2020.2976475</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gao</surname> <given-names>H.</given-names></name>
<name><surname>Tian</surname> <given-names>Y.</given-names></name>
<name><surname>Xu</surname> <given-names>F.</given-names></name>
<name><surname>Zhong</surname> <given-names>S.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Survey of deep learning model compression and acceleration</article-title>. <source>J. Software</source> <volume>32</volume>, <issue>1</issue>, <fpage>68</fpage>&#x2013;<lpage>92</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13328/j.cnki.jos.006096</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gao</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Niu</surname> <given-names>R.</given-names></name>
<name><surname>Fang</surname> <given-names>X.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Sun</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Driving risk assessment for intelligent vehicles based on entropy-informed graph neural networks and gaussian distributions</article-title>. <source>IEEE Trans. Neural Networks Learn. Syst.</source> <volume>36</volume>, <fpage>16478</fpage>&#x2013;<lpage>16491</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TNNLS.2025.3569826</pub-id>, PMID: <pub-id pub-id-type="pmid">40445818</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Haijoub</surname> <given-names>A.</given-names></name>
<name><surname>Hatim</surname> <given-names>A.</given-names></name>
<name><surname>Guerrero-Gonzalez</surname> <given-names>A.</given-names></name>
<name><surname>Arioua</surname> <given-names>M.</given-names></name>
<name><surname>Chougdali</surname> <given-names>K.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Enhanced YOLOv8 ship detection empower unmanned surface vehicles for advanced maritime surveillance</article-title>. <source>J. Imaging</source> <volume>10</volume>, <fpage>303</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jimaging10120303</pub-id>, PMID: <pub-id pub-id-type="pmid">39728200</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Han</surname> <given-names>K.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Tian</surname> <given-names>Q.</given-names></name>
<name><surname>Guo</surname> <given-names>J.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>GhostNet: More features from cheap operations</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Seattle, WA, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>1577</fpage>&#x2013;<lpage>1586</lpage>.
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Han</surname> <given-names>S.</given-names></name>
<name><surname>Pool</surname> <given-names>J.</given-names></name>
<name><surname>Tran</surname> <given-names>J.</given-names></name>
<name><surname>Dally</surname> <given-names>W. J.</given-names></name>
</person-group> (<year>2015</year>). &#x201c;
<article-title>Learning both weights and connections for efficient neural networks</article-title>,&#x201d; in <conf-name>Proceedings of the 29th International Conference on Neural Information Processing Systems</conf-name>, (<publisher-loc>Barcelona, Spain</publisher-loc>: 
<publisher-name>Curran Associates, Inc</publisher-name>). <fpage>1135</fpage>&#x2013;<lpage>1143</lpage>.
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hinton</surname> <given-names>G. E.</given-names></name>
<name><surname>Srivastava</surname> <given-names>N.</given-names></name>
<name><surname>Krizhevsky</surname> <given-names>A.</given-names></name>
<name><surname>Sutskever</surname> <given-names>I.</given-names></name>
<name><surname>Salakhutdinov</surname> <given-names>R. R.</given-names></name>
</person-group> (<year>2012</year>). 
<article-title>Improving neural networks by preventing co-adaptation of feature detectors</article-title>. <source>ArXiv</source>. [Preprint]. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1207.0580</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Hou</surname> <given-names>Q.</given-names></name>
<name><surname>Zhou</surname> <given-names>D.</given-names></name>
<name><surname>Feng</surname> <given-names>J.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Coordinate attention for efficient mobile network design</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Nashville, TN, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>13708</fpage>&#x2013;<lpage>13717</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01350</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Howard</surname> <given-names>A.</given-names></name>
<name><surname>Sandler</surname> <given-names>M.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Chen</surname> <given-names>L. C.</given-names></name>
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). &#x201c;
<article-title>Searching for mobilenetv3</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>, (<publisher-loc>Seoul, Korea (South)</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>.
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Howard</surname> <given-names>A.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<name><surname>Kalenichenko</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Weyand</surname> <given-names>T.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). 
<article-title>MobileNets: Efficient convolutional neural networks for mobile vision applications</article-title>. <source>ArXiv</source>. [Preprint]. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1704.04861</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hu</surname> <given-names>J.</given-names></name>
<name><surname>Shen</surname> <given-names>L.</given-names></name>
<name><surname>Albanie</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>G.</given-names></name>
<name><surname>Wu</surname> <given-names>E.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Squeeze-and-excitation networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>42</volume>, <fpage>2011</fpage>&#x2013;<lpage>2023</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2019.2913372</pub-id>, PMID: <pub-id pub-id-type="pmid">31034408</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Huang</surname> <given-names>R.</given-names></name>
<name><surname>Pedoeem</surname> <given-names>J.</given-names></name>
<name><surname>Chen</surname> <given-names>C.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>YOLO-LITE: A real-time object detection algorithm optimized for non-GPU computers</article-title>,&#x201d; in <conf-name>2018 IEEE International Conference on Big Data</conf-name>, (<publisher-loc>Seattle, WA, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>2503</fpage>&#x2013;<lpage>2510</lpage>.
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Kawahara</surname> <given-names>T.</given-names></name>
<name><surname>Toda</surname> <given-names>S.</given-names></name>
<name><surname>Mikami</surname> <given-names>A.</given-names></name>
<name><surname>Tanabe</surname> <given-names>M.</given-names></name>
</person-group> (<year>2012</year>). &#x201c;
<article-title>Automatic ship recognition robust against aspect angle changes and occlusions</article-title>,&#x201d; in <conf-name>Proceedings of IEEE National Radar Conference</conf-name>, (<publisher-loc>Atlanta, GA, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>864</fpage>&#x2013;<lpage>869</lpage>.
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kortylewski</surname> <given-names>A.</given-names></name>
<name><surname>Liu</surname> <given-names>Q.</given-names></name>
<name><surname>Wang</surname> <given-names>A.</given-names></name>
<name><surname>Sun</surname> <given-names>Y.</given-names></name>
<name><surname>Yuille</surname> <given-names>A.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Compositional convolutional neural networks: A robust and interpretable model for object recognition under occlusion</article-title>. <source>Int. J. Comput. Vision</source> <volume>129</volume>, <fpage>736</fpage>&#x2013;<lpage>760</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11263-020-01401-3</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Kortylewski</surname> <given-names>A.</given-names></name>
<name><surname>Liu</surname> <given-names>Q.</given-names></name>
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<name><surname>Yuille</surname> <given-names>A.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Combining compositional models and deep networks for robust object classification under occlusion</article-title>,&#x201d; in <conf-name>Workshop on Applications of Computer Vision</conf-name>, (<publisher-loc>Snowmass, CO, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>1322</fpage>&#x2013;<lpage>1330</lpage>.
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lecun</surname> <given-names>Y.</given-names></name>
<name><surname>Bengio</surname> <given-names>Y.</given-names></name>
<name><surname>Hinton</surname> <given-names>G. E.</given-names></name>
</person-group> (<year>2015</year>). 
<article-title>Deep learning</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>436</fpage>&#x2013;<lpage>444</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/nature14539</pub-id>, PMID: <pub-id pub-id-type="pmid">26017442</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lecun</surname> <given-names>Y.</given-names></name>
<name><surname>Bottou</surname> <given-names>L.</given-names></name>
<name><surname>Bengio</surname> <given-names>Y.</given-names></name>
<name><surname>Haffner</surname> <given-names>P.</given-names></name>
</person-group> (<year>1998</year>). 
<article-title>Gradient-based learning applied to document recognition</article-title>. <source>Proc. IEEE</source> <volume>86</volume>, <fpage>2278</fpage>&#x2013;<lpage>2324</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/5.726791</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Kadav</surname> <given-names>A.</given-names></name>
<name><surname>Durdanovic</surname> <given-names>I.</given-names></name>
<name><surname>Samet</surname> <given-names>H.</given-names></name>
<name><surname>Graf</surname> <given-names>H. P.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Pruning filters for efficient convnets</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Learning Representation</conf-name>, (<publisher-loc>Hobart, Tasmania, Australia</publisher-loc>: 
<publisher-name>Curran Associates, Inc</publisher-name>). <fpage>24</fpage>&#x2013;<lpage>26</lpage>.
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>EGM-YOLOv8: A lightweight ship detection model with efficient feature fusion and attention mechanisms</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>13</volume>, <elocation-id>757</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse13040757</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Lin</surname> <given-names>Y.</given-names></name>
<name><surname>Cao</surname> <given-names>Y.</given-names></name>
<name><surname>Hu</surname> <given-names>H.</given-names></name>
<name><surname>Wei</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>b). &#x201c;
<article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>, (<publisher-loc>Montreal, Canada</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>9992</fpage>&#x2013;<lpage>10002</lpage>.
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Sun</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Hai</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Real-time ship detection in maritime environments using an efficient feature fusion network on embedded GPUs</article-title>. <source>Ocean Eng.</source> <volume>266</volume>, <fpage>113074</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2022.113074</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>T.</given-names></name>
<name><surname>Pang</surname> <given-names>B.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Yang</surname> <given-names>W.</given-names></name>
<name><surname>Sun</surname> <given-names>X.</given-names></name>
</person-group> (<year>2021</year>a). 
<article-title>Sea surface object detection algorithm based on YOLOv4 fused with reverse depthwise separable convolution (RDSC) for USV</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>9</volume>, <fpage>753</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse9070753</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>Q.</given-names></name>
<name><surname>Xiang</surname> <given-names>X.</given-names></name>
<name><surname>Yang</surname> <given-names>S.</given-names></name>
<name><surname>Huang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhu</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Intelligent decision and planning for unmanned surface vehicle: A review of machine learning techniques</article-title>. <source>Ocean Eng.</source> <volume>327</volume>, <elocation-id>120968</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2025.120968</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Ma</surname> <given-names>N.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Zheng</surname> <given-names>H. T.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>ShuffleNet V2: Practical guidelines for efficient CNN architecture design</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision</conf-name>, (<publisher-loc>Munich, Germany</publisher-loc>: 
<publisher-name>Springer</publisher-name>). <fpage>122</fpage>&#x2013;<lpage>138</lpage>.
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Maki</surname> <given-names>A.</given-names></name>
<name><surname>Fukui</surname> <given-names>K.</given-names></name>
<name><surname>Kawawada</surname> <given-names>Y.</given-names></name>
<name><surname>Kiya</surname> <given-names>M.</given-names></name>
</person-group> (<year>2002</year>). &#x201c;
<article-title>Automatic ship identification in ISAR imagery: A non-line system using CMSM</article-title>,&#x201d; in <conf-name>Proceedings of the 2002 IEEE Radar Conference</conf-name>, (<publisher-loc>Long Beach, CA, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>206</fpage>&#x2013;<lpage>211</lpage>.
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Martin-Salinas</surname> <given-names>I.</given-names></name>
<name><surname>Badia</surname> <given-names>J. M.</given-names></name>
<name><surname>Valls</surname> <given-names>O.</given-names></name>
<name><surname>Leon</surname> <given-names>G.</given-names></name>
<name><surname>Del Amor</surname> <given-names>R.</given-names></name>
<name><surname>Belloch</surname> <given-names>J. A.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Evaluating and accelerating vision transformers on GPU-based embedded edge AI systems</article-title>. <source>J. Supercomputing</source> <volume>81</volume>, <fpage>1</fpage>&#x2013;<lpage>21</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11227-024-06807-1</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Molchanov</surname> <given-names>P.</given-names></name>
<name><surname>Tyree</surname> <given-names>S.</given-names></name>
<name><surname>Karras</surname> <given-names>T.</given-names></name>
<name><surname>Aila</surname> <given-names>T.</given-names></name>
<name><surname>Kautz</surname> <given-names>J.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Pruning convolutional neural networks for resource efficient inference</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations</conf-name>, (<publisher-loc>Toulon, France</publisher-loc>: 
<publisher-name>Curran Associates, Inc.</publisher-name>). <fpage>1</fpage>&#x2013;<lpage>17</lpage>.
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Moosbauer</surname> <given-names>S.</given-names></name>
<name><surname>K&#xf6;nig</surname> <given-names>D.</given-names></name>
<name><surname>J&#xe4;kel</surname> <given-names>J.</given-names></name>
<name><surname>Teutsch</surname> <given-names>M.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>A benchmark for deep learning based object detection in maritime environments</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops</conf-name>, (<publisher-loc>Long Beach, CA, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>916</fpage>&#x2013;<lpage>925</lpage>.
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ong</surname> <given-names>J.</given-names></name>
<name><surname>Vo</surname> <given-names>B.-T.</given-names></name>
<name><surname>Vo</surname> <given-names>B.-N.</given-names></name>
<name><surname>Kim</surname> <given-names>D. Y.</given-names></name>
<name><surname>Nordholm</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>A Bayesian filter for multi-view 3D multi-object tracking with occlusion handling</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>2246</fpage>&#x2013;<lpage>2263</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2020.3034435</pub-id>, PMID: <pub-id pub-id-type="pmid">33112741</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Ranftl</surname> <given-names>R.</given-names></name>
<name><surname>Bochkovskiy</surname> <given-names>A.</given-names></name>
<name><surname>Koltun</surname> <given-names>V.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Vision transformers for dense prediction</article-title>,&#x201d; in <conf-name>IEEE International Conference on Computer Vision</conf-name>, (<publisher-loc>Montreal, Canada</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>12159</fpage>&#x2013;<lpage>12168</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01196</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rensink</surname> <given-names>R. A.</given-names></name>
<name><surname>Enns</surname> <given-names>J. T.</given-names></name>
</person-group> (<year>1998</year>). 
<article-title>Early completion of occluded objects</article-title>. <source>Vision Res.</source> <volume>38</volume>, <fpage>2489</fpage>&#x2013;<lpage>2505</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0042-6989(98)00051-0</pub-id>, PMID: <pub-id pub-id-type="pmid">9798011</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Sandler</surname> <given-names>M.</given-names></name>
<name><surname>Howard</surname> <given-names>A.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Zhmoginov</surname> <given-names>A.</given-names></name>
<name><surname>Chen</surname> <given-names>L. C.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>MobileNetV2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Salt Lake City, UT, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>.
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Sankaran</surname> <given-names>R. K.</given-names></name>
<name><surname>Paliwal</surname> <given-names>S.</given-names></name>
<name><surname>Pakrashi</surname> <given-names>V.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>Edge-cascade: A lightweight CNN architecture for real-time obstacle detection on USVs</article-title>,&#x201d; in <conf-name>2023 IEEE International Conference on Robotics and Automation (ICRA)</conf-name>, (<publisher-loc>London, UK</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>11877</fpage>&#x2013;<lpage>11883</lpage>.
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Song</surname> <given-names>W.-J.</given-names></name>
<name><surname>Im</surname> <given-names>K.-S.</given-names></name>
<name><surname>Lee</surname> <given-names>S.-K.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A review on the USV technologies for safety surveillance and disaster prevention in the ocean</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>12</volume>, <issue>7</issue>, <elocation-id>1140</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse12071140</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>F.</given-names></name>
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Xie</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>Z.</given-names></name>
<name><surname>Yang</surname> <given-names>C.</given-names></name>
<name><surname>Qi</surname> <given-names>J.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Review of deep learning applied to occluded object detection</article-title>. <source>J. Front. Comput. Sci. Technol.</source> <volume>16</volume>, <issue>8</issue>. doi:&#xa0;<pub-id pub-id-type="doi">10.3778/j.issn.1673-9418.2104046</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<name><surname>Le</surname> <given-names>Q. V.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>EfficientNet: Rethinking model scaling for convolutional neural networks</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Machine Learning</conf-name>, (<publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>). <fpage>6105</fpage>&#x2013;<lpage>6114</lpage>.
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<name><surname>Le</surname> <given-names>Q. V.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>EfficientNetV2: Smaller models and faster training</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Machine Learning</conf-name>, (<publisher-loc>Vienna, Austria</publisher-loc>: 
<publisher-name>PMLR</publisher-name>). <fpage>10096</fpage>&#x2013;<lpage>10106</lpage>.
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tang</surname> <given-names>L.</given-names></name>
<name><surname>Xu</surname> <given-names>Y.</given-names></name>
<name><surname>Xu</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Robust ship tracking in maritime surveillance videos under complex occlusion conditions</article-title>. <source>IEEE Trans. Intelligent Transportation Syst.</source> <volume>23</volume>, <issue>10</issue>, <fpage>19356</fpage>&#x2013;<lpage>19368</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TITS.2022.3142795</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tersek</surname> <given-names>M.</given-names></name>
<name><surname>Zust</surname> <given-names>L.</given-names></name>
<name><surname>Kristan</surname> <given-names>M.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>eWaSR-An embedded-compute-ready maritime obstacle detection network</article-title>. <source>Sensors</source> <volume>23</volume>, <issue>12</issue>, <elocation-id>5386</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23125386</pub-id>, PMID: <pub-id pub-id-type="pmid">37420553</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>C.-Y.</given-names></name>
<name><surname>Bochkovskiy</surname> <given-names>A.</given-names></name>
<name><surname>Liao</surname> <given-names>H.-Y. M.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Scaled-YOLOv4: Scaling cross stage partial network</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Nashville, TN, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>13024</fpage>&#x2013;<lpage>13033</lpage>.
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>N.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Feng</surname> <given-names>Y.</given-names></name>
<name><surname>Wei</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>a). 
<article-title>AodeMar: Attention-aware occlusion detection of vessels for maritime autonomous surface ships</article-title>. <source>IEEE Trans. Intelligent Transportation Syst.</source> <volume>25</volume>, <fpage>13584</fpage>&#x2013;<lpage>13597</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TITS.2024.3398733</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>N.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Wei</surname> <given-names>Y.</given-names></name>
<name><surname>Han</surname> <given-names>B.</given-names></name>
<name><surname>Feng</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>b). 
<article-title>Marine vessel detection dataset and benchmark for unmanned surface vehicles</article-title>. <source>Appl. Ocean Res.</source> <volume>142</volume>, <elocation-id>103835</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.apor.2023.103835</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>T.</given-names></name>
<name><surname>Yuan</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Feng</surname> <given-names>J.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Distilling object detectors with fine grained feature imitation</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Long Beach, CA, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>4928</fpage>&#x2013;<lpage>4937</lpage>.
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>P.</given-names></name>
<name><surname>Huang</surname> <given-names>H.</given-names></name>
<name><surname>Qian</surname> <given-names>H.</given-names></name>
<name><surname>Su</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>B.</given-names></name>
<name><surname>Zuo</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>SRCANet: Stacked residual coordinate attention network for infrared ship detection</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2022.3218563</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>C. M.</given-names></name>
<name><surname>Lei</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>Z. Q.</given-names></name>
<name><surname>Ren</surname> <given-names>M. L.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Ship_YOLO: General ship detection based on mixed distillation and dynamic task-aligned detection head</article-title>. <source>Ocean Eng.</source> <volume>323</volume>, <elocation-id>120616</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.oceaneng.2025.120616</pub-id>
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xiang</surname> <given-names>W.</given-names></name>
<name><surname>Pan</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A ghost and attention mechanism based deep learning approach for SAR small target image detection</article-title>. <source>Chiang Mai J. Sci.</source> <volume>51</volume>, <issue>5</issue>, <elocation-id>e2024076</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.12982/CMJS.2024.076</pub-id>
</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>B.</given-names></name>
<name><surname>Ding</surname> <given-names>S.</given-names></name>
<name><surname>Jiang</surname> <given-names>P.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>A lightweight sea surface object detection network for unmanned surface vehicles</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>10</volume>, <fpage>965</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse10070965</pub-id>
</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yun</surname> <given-names>S.</given-names></name>
<name><surname>Han</surname> <given-names>D.</given-names></name>
<name><surname>Chun</surname> <given-names>S.</given-names></name>
<name><surname>Oh</surname> <given-names>S. J.</given-names></name>
<name><surname>Yoo</surname> <given-names>Y.</given-names></name>
<name><surname>Choe</surname> <given-names>J.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>CutMix: Regularization strategy to train strong classifiers with localizable features</article-title>,&#x201d; in <conf-name>IEEE International Conference on Computer Vision</conf-name>, (<publisher-loc>Seoul, South Korea</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>6022</fpage>&#x2013;<lpage>6031</lpage>.
</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zeng</surname> <given-names>G.</given-names></name>
<name><surname>Yu</surname> <given-names>W.</given-names></name>
<name><surname>Wang</surname> <given-names>R.</given-names></name>
<name><surname>Lin</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Research on mosaic image data enhancement and detection method for overlapping ship targets</article-title>. <source>Control Theory Appl.</source> <volume>39</volume>, <issue>6</issue>, <fpage>1139</fpage>&#x2013;<lpage>1148</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7641/CTA.2021.10329</pub-id>
</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Zhou</surname> <given-names>X.</given-names></name>
<name><surname>Lin</surname> <given-names>M.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>ShuffleNet: An extremely efficient convolutional neural network for mobile devices</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, (<publisher-loc>Salt Lake City, UT, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>6848</fpage>&#x2013;<lpage>6856</lpage>.
</mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>S.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Towards occlusion handling: object tracking with background estimation</article-title>. <source>IEEE Trans. Cybernetics</source> <volume>48</volume>, <fpage>2086</fpage>&#x2013;<lpage>2100</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TCYB.2017.2727138</pub-id>, PMID: <pub-id pub-id-type="pmid">28767381</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2076292">Chengbo Wang</ext-link>, Xidian University, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3205604">Zeguo Zhang</ext-link>, Guangdong Ocean University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3343985">Jiong Li</ext-link>, Shanghai Maritime University, China</p></fn>
</fn-group>
<fn-group>
<fn id="fn1"><label>1</label>
<p><ext-link ext-link-type="uri" xlink:href="https://github.com/yyuanwang1010/MVDD13">https://github.com/yyuanwang1010/MVDD13</ext-link>.</p></fn>
<fn id="fn2"><label>2</label>
<p><ext-link ext-link-type="uri" xlink:href="https://sites.google.com/site/dilipprasad/home/singapore-maritime-dataset">https://sites.google.com/site/dilipprasad/home/singapore-maritime-dataset</ext-link>.</p></fn>
<fn id="fn3"><label>3</label>
<p><ext-link ext-link-type="uri" xlink:href="https://mmdetection.readthedocs.io/zhCN/latest/userguides/usefultools">https://mmdetection.readthedocs.io/zhCN/latest/userguides/usefultools</ext-link>. html?highlight = FPSfps.</p></fn>
</fn-group>
</back>
</article>