<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2024.1513740</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Marine Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>LFN-YOLO: precision underwater small object detection via a lightweight reparameterized approach</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Mingxin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1911654"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Yujie</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2723350"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Ruixin</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2569115"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Lin</surname>
<given-names>Cong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2887889"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Electronics and Information Engineering, Guangdong Ocean University</institution>, <addr-line>Zhanjiang</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Guangdong Provincial Key Laboratory of Intelligent Equipment for South China Sea Marine Ranching</institution>, <addr-line>Zhanjiang</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>College of Naval Architecture and Shipping, Guangdong Ocean University</institution>, <addr-line>Zhanjiang</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Narayanamoorthi R., SRM Institute of Science and Technology, India</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Zhaoqiang Xia, Northwestern Polytechnical University, China</p>
<p>Hao Wang, China University of Petroleum, China</p>
<p>Mingzhi Chen, University of Shanghai for Science and Technology, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Cong Lin, <email xlink:href="mailto:lincong@gdou.edu.cn">lincong@gdou.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1513740</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Liu, Wu, Li and Lin</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Liu, Wu, Li and Lin</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Underwater object detection plays a significant role in fisheries resource assessment and ecological environment protection. However, traditional underwater object detection methods struggle to achieve accurate detection in complex underwater environments with limited computational resources. This paper proposes a lightweight underwater object detection network called LightFusionNet-YOLO (LFN-YOLO). First, we introduce the reparameterization technique RepGhost to reduce the number of parameters while enhancing training and inference efficiency. This approach effectively minimizes precision loss even with a lightweight backbone network. Then, we replaced the standard depthwise convolution in the feature extraction network with SPD-Conv, which includes an additional pooling layer to mitigate detail loss. This modification effectively enhances the detection performance for small objects. Furthermore, we employed the Generalized Feature Pyramid Network (GFPN) for feature fusion in the network's neck, enhancing the network's adaptability to features of varying scales. Finally, we design a new detection head, CLLAHead, which reduces computational costs and strengthens the robustness of the model through cross-layer local attention. At the same time, the DFL loss function is introduced to reduce regression and classification errors. Experiments conducted on public datasets, including URPC, Brackish, and TrashCan, showed that the mAP@0.5 reached 74.1%, 97.5%, and 66.2%, respectively, with parameter sizes and computational complexities of 2.7M and 7.2 GFLOPs, and the model size is only 5.9 Mb. Compared to mainstream vision models, our model demonstrates superior performance. Additionally, deployment on the NVIDIA Jetson AGX Orin edge computing device confirms its high real-time performance and suitability for underwater applications, further showcasing the exceptional capabilities of LFN-YOLO.</p>
</abstract>
<kwd-group>
<kwd>underwater object detection</kwd>
<kwd>lightweight detector</kwd>
<kwd>small object</kwd>
<kwd>marine resources</kwd>
<kwd>multi-scale feature fusion</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="4"/>
<equation-count count="8"/>
<ref-count count="48"/>
<page-count count="19"/>
<word-count count="7732"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Ocean Observation</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Underwater object detection plays a crucial role in fisheries resource assessment and the ecological environment protection. As global attention on sustainable development increases, accurately monitoring the state of underwater ecosystems and resources becomes particularly important (<xref ref-type="bibr" rid="B10">Grip and Blomqvist, 2020</xref>). With challenges in complex underwater environments, including insufficient lighting, clutter interference, and limited computational resources, traditional underwater object detection methods struggle to achieve optimal detection accuracy (<xref ref-type="bibr" rid="B8">Er et&#xa0;al., 2023</xref>) (<xref ref-type="bibr" rid="B22">Liu et&#xa0;al., 2023b</xref>). The above issues limit effective resource management and ecological monitoring. Therefore, developing efficient and reliable underwater object detection technology not only helps improve the accuracy of fisheries resource assessments but also provides scientific evidence for ecological protection, ensuring the sustainable development of marine ecosystems (<xref ref-type="bibr" rid="B48">Zhou et&#xa0;al., 2024</xref>).</p>
<p>In recent years, deep learning-based object detection technology has been widely applied in various fields. Deep learning-based object detection algorithms are generally categorized into two-stage and one-stage detection algorithms. The former involves generating candidate regions first and then classifying and localizing these regions, which leads to high detection accuracy but with the downside of complex structures and low real-time performance. Notable examples include Faster R-CNN (<xref ref-type="bibr" rid="B29">Ren et&#xa0;al., 2017</xref>), R-FCN (<xref ref-type="bibr" rid="B7">Dai et&#xa0;al., 2016</xref>), and Mask R-CNN (<xref ref-type="bibr" rid="B12">He et&#xa0;al., 2017</xref>). The latter completes object detection in a single forward propagation without generating candidate regions, resulting in a simplified structure and a more lightweight model that effectively balances accuracy and speed. These methods perform well in various scenarios, with YOLO (<xref ref-type="bibr" rid="B28">Redmon et&#xa0;al., 2016</xref>), SSD (<xref ref-type="bibr" rid="B21">Liu et&#xa0;al., 2016</xref>), and RetinaNet (<xref ref-type="bibr" rid="B30">Ross and Dollar, 2017</xref>) being notable examples. YOLO, proposed by Joseph Redmon in 2016, transformed object detection into a single regression problem and achieved real-time object detection by dividing images into grids.</p>
<p>YOLOv8, the most representative algorithm in the YOLO series, strikes a good balance between accuracy and model size, making it more suitable for industrial applications. However, YOLOv8 was not specifically designed for underwater environments, leaving room for improvement in underwater detection tasks. Building upon YOLOv8, we propose the Light Fusion Net YOLO (LFN-YOLO) model to enhance the lightweight characteristics and performance of underwater target detection models. Experimental results demonstrate that the model performs exceptionally well on the URPC (Zhanjiang, 2021 China Underwater Robot Professional Contest) dataset, with a 2.2% increase in mAP@0.5 and a 19.1% reduction in GFLOPs, achieving only 7.2 GFLOPs. The parameters were reduced by 15.6%, down to 2.6M. Furthermore, LFN-YOLO demonstrated excellent performance on the Brackish dataset, achieving the highest accuracy and the smallest model size in comparison experiments with other mainstream one-stage detection algorithms. This demonstrates that LFN-YOLO strikes a better balance between accuracy and model complexity, making it suitable for underwater target detection tasks on platforms with limited hardware capabilities. The main contributions of this paper are as follows:</p>
<p>1) To reduce the number of network parameters and computational complexity while enhancing training and inference efficiency, a reparameterization approach is employed in the backbone network to facilitate feature reuse. Furthermore, SPD-Conv is utilized in the feature extraction process to enhance the ability to capture small object features effectively.</p>
<p>2) To improve the network&#x2019;s ability to adapt to features of varying sizes, the Generalized Feature Pyramid Network was applied for feature fusion, which effectively fuses geometric detail information from low-level features with semantic information from high-level features, allowing better feature extraction for underwater objects of varying sizes.</p>
<p>3) A lightweight detection head, CLLAHead, was designed in this paper, which incorporates a cross-layer local attention mechanism. This design reduces unnecessary computations and enhances the model&#x2019;s robustness in underwater environments. Additionally, the Distribution Focal Loss was introduced to minimize both regression and classification losses in target detection.</p>
<p>4) The proposed LFN-YOLO demonstrates superior performance in detection accuracy, network lightweight, and adaptability to underwater environments. Additionally, this paper presents an efficient underwater deployment solution. With the optimized network architecture, LFN-YOLO shows improved detection accuracy and higher FPS in real underwater scenarios.</p>
<p>The paper is organized as follows. Section 2 reviews the development of underwater object detection research, along with related work on lightweight networks and small object detection. Section 3 provides a detailed introduction to the network structure of LFN-YOLO, covering the overall design and the internal principles of each module. Section 4 describes the experimental setup, including datasets, evaluation metrics, equipment, and software. Section 5 presents the experimental results and analysis, including ablation and comparative experiments, as well as underwater deployment experiments. In Section 6, the generality and robustness of the LFN-YOLO model are evaluated using the TrashCan dataset. Finally, Section 7 concludes the paper.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Underwater object detection</title>
<p>In recent years, deep learning-based underwater object detection models have rapidly evolved. Many researchers have focused on developing algorithms to tackle the challenges of underwater images, which often suffer from high noise, low contrast, and color distortion (<xref ref-type="bibr" rid="B45">Zhang et&#xa0;al., 2024b</xref>). <xref ref-type="bibr" rid="B38">Wang et&#xa0;al. (2023)</xref> proposed a reinforcement learning paradigm for underwater visual enhancement, which simultaneously optimizes the target detection and visual enhancement tasks. However, the variability of underwater environments poses limitations for the visual enhancement algorithm. To address this, <xref ref-type="bibr" rid="B39">Wang et&#xa0;al. (2024)</xref> introduced a new underwater image enhancement method that can select an enhancement technique and configuration parameters based on the degree of image degradation, thereby improving the effectiveness of the enhancement for practical applications. Additionally, underwater environments present unique challenges for object detection, such as background interference, dense object distribution, and occlusion. These issues contrast sharply with those in conventional detection scenarios, highlighting the complexity and necessity of robust underwater detection methods (<xref ref-type="bibr" rid="B16">Jian et&#xa0;al., 2021</xref>). For instance, <xref ref-type="bibr" rid="B37">Wang et&#xa0;al. (2022)</xref> proposed an enhanced YOLO network without anchor points. They utilized Retinex theory to eliminate impurities in underwater images and subsequently performed multi-scale feature fusion in the YOLO network. This approach reduced the inference time for regression and classification tasks while improving the accuracy of underwater object detection. <xref ref-type="bibr" rid="B43">Yan et&#xa0;al. (2023)</xref> proposed a dual adversarial contrastive learning enhancement network for underwater images. 
This network transforms degraded waters into high-quality waters and builds an inverse circulation net mapping in a self-learning manner, reducing dependency on training data and significantly enhancing the quality of underwater images. <xref ref-type="bibr" rid="B24">Liu et&#xa0;al. (2023a)</xref> proposed the YOLOv7-AC network for underwater object detection, which replaces the YOLOv7 convolution module with an ACmixBlock and incorporates global attention in the backbone network. Additionally, the K-means algorithm was employed to optimize the anchor box selection, improving both average precision and inference speed. <xref ref-type="bibr" rid="B47">Zhao et&#xa0;al. (2023)</xref> introduced the YOLOv7-CHS model, which integrates a non-contextual transformer module with parameter-free attention to learn spatial and channel relationships, resulting in enhanced detection performance. <xref ref-type="bibr" rid="B46">Zhang et&#xa0;al. (2024a)</xref> proposed the FasterNetT0 as the backbone network, reducing the number of parameters and computational complexity. They further added a small object detection head to improve accuracy for small targets, and used Deformable ConvNets and channel attention mechanisms in the neck to handle irregularly shaped and occluded objects. A comprehensive qualitative comparison of underwater object detection methods developed in recent years is shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. However, these underwater detection models primarily focus on accurate object identification, without fully considering the need for lightweight models that can be efficiently deployed in real-world scenarios.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comprehensive qualitative comparison of underwater object detection methods developed in recent years.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Method</th>
<th valign="top" align="center">Dataset</th>
<th valign="top" align="center">Backbone</th>
<th valign="top" align="center">Method highlights</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Enhanced YOLO (<xref ref-type="bibr" rid="B37">Wang et&#xa0;al., 2022</xref>)</td>
<td valign="top" align="center">LED water tank image</td>
<td valign="top" align="center">Resnet</td>
<td valign="top" align="center">Retinex theory</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7-AC (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2023a</xref>)</td>
<td valign="top" align="center">URPC, Brackish</td>
<td valign="top" align="center">Darknet53</td>
<td valign="top" align="center">K-means algorithm for anchor box generation</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7-CHS (<xref ref-type="bibr" rid="B47">Zhao et&#xa0;al., 2023</xref>)</td>
<td valign="top" align="center">Starfish, DUO</td>
<td valign="top" align="center">HOSI-Darknet53</td>
<td valign="top" align="center">High-order spatial interaction, Contextual transformer</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8 improved (<xref ref-type="bibr" rid="B46">Zhang et&#xa0;al., 2024a</xref>)</td>
<td valign="top" align="center">UTDAC2020, Pascal VOC</td>
<td valign="top" align="center">FasterNet-T0</td>
<td valign="top" align="center">Deformable ConvNets</td>
</tr>
<tr>
<td valign="top" align="left">CHE-YOLO (<xref ref-type="bibr" rid="B9">Feng and Jin, 2024</xref>)</td>
<td valign="top" align="center">DUO, UTDAC2020</td>
<td valign="top" align="center">Darknet-53</td>
<td valign="top" align="center">High-order deformable attention, Enhanced spatial pyramid pooling-fast</td>
</tr>
<tr>
<td valign="top" align="left">LFN-YOLO (Ours)</td>
<td valign="top" align="center">URPC, Brackish</td>
<td valign="top" align="center">RepGhostNet</td>
<td valign="top" align="center">Cross-Level Local Attention, Detecting head</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Our research aims to improve the accuracy of object detection models in underwater environments while reducing network parameters and computational complexity to enable deployment on hardware with varying performance levels. To accomplish this, we designed CLLAHead, which incorporates a cross-layer local attention mechanism (<xref ref-type="bibr" rid="B35">Tang and Li, 2020</xref>) and introduced the Distribution Focal Loss (DFL) (<xref ref-type="bibr" rid="B19">Li et&#xa0;al., 2023</xref>). These improvements have enhanced the model&#x2019;s ability to accurately identify and locate objects in underwater environments, while also reducing unnecessary computational overhead and hardware resource requirements.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Lightweight network</title>
<p>In recent years, the rapid development of Graphics Processing Units (GPUs) has accelerated the growth of deep neural networks (DNNs) across various fields. Simultaneously, the deployment of DNN models on resource-limited devices, such as mobile and edge devices, has become increasingly common. These devices often have constrained computational power and storage, posing challenges for DNN deployment. Balancing high accuracy with reduced model size and computational complexity is a key challenge that needs to be addressed (<xref ref-type="bibr" rid="B41">Xu et&#xa0;al., 2023a</xref>).</p>
<p>Currently, lightweight models are primarily achieved through two approaches: network architecture design and model compression (<xref ref-type="bibr" rid="B20">Lin et&#xa0;al., 2024</xref>). The former involves designing a more efficient network structure to reduce the number of parameters and floating-point operations (FLOPs). Popular networks in this category include MobileNet (V1, V2, V3) (<xref ref-type="bibr" rid="B14">Howard, 2017</xref>) (<xref ref-type="bibr" rid="B31">Sandler et&#xa0;al., 2018</xref>) (<xref ref-type="bibr" rid="B15">Howard et&#xa0;al., 2019</xref>), EfficientNet (<xref ref-type="bibr" rid="B34">Tan and Le, 2019</xref>), GhostNet (<xref ref-type="bibr" rid="B11">Han et&#xa0;al., 2020</xref>), and FasterNet (<xref ref-type="bibr" rid="B3">Chen et&#xa0;al., 2023a</xref>), which have significantly contributed to advancing deep learning on mobile and edge devices. On the other hand, model compression techniques&#x2014;such as pruning, quantization, and knowledge distillation&#x2014;focus on reducing parameters and complexity while maintaining performance.</p>
<p>In object detection, network architecture design is a commonly used method for achieving lightweight models. <xref ref-type="bibr" rid="B6">Cheng et&#xa0;al. (2023b)</xref> proposed replacing the YOLOv4 feature extraction backbone with the lightweight MobileViT network, effectively extracting both local and global features of objects while reducing model complexity. <xref ref-type="bibr" rid="B32">Shang et&#xa0;al. (2023)</xref> suggested using ShuffleNetv2 to replace the YOLOv5 backbone, which reduces memory access costs and convolution operations, leading to a smaller model size and faster detection speeds. <xref ref-type="bibr" rid="B46">Zhang et&#xa0;al. (2024a)</xref> introduced the FasterNet network to replace the YOLOv8 backbone for lightweight underwater object detection, aiming to reduce parameters and computational complexity while maintaining accuracy. Although these methods contribute to model lightweighting, they often fail to achieve a satisfactory level of accuracy. Therefore, we leverage SPD-Conv (<xref ref-type="bibr" rid="B33">Sunkara and Luo, 2022</xref>) for feature extraction and incorporate RepGhost for reparameterized feature reuse (<xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2024</xref>) in the backbone network, ensuring that the underwater object detection model achieves improved accuracy while maintaining a lightweight backbone structure.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Small object detection</title>
<p>In object detection tasks, deep neural networks (DNNs) typically recognize objects by capturing edge features and geometric cues. However, underwater images often present significant challenges due to occlusion and the presence of small objects, making underwater object detection particularly difficult. Improving the model&#x2019;s ability to detect small objects is crucial for practical applications in underwater object detection.</p>
<p>Small objects are generally categorized into two types based on their definition: relative and absolute small objects (<xref ref-type="bibr" rid="B36">Tong and Wu, 2022</xref>). Relative small objects refer to targets whose area is less than 1% of the image area, while absolute small objects are defined based on fixed size thresholds. For instance, in the MS-COCO dataset, absolute small objects are defined as those with dimensions smaller than 32&#xd7;32 pixels (<xref ref-type="bibr" rid="B18">Krishna and Jawahar, 2017</xref>). These definitions provide a basis for evaluating the performance of object detection models in various contexts, especially in scenarios with complex underwater environments.</p>
<p>Effective multi-scale feature fusion can significantly enhance the model&#x2019;s ability to detect small objects. Multi-scale feature fusion involves combining geometric details and positional information from low-level feature maps with rich semantic information from high-level feature maps. Notable methods include the Feature Pyramid Network (FPN), the Asymptotic Feature Pyramid Network (AFPN), and the Bidirectional Feature Pyramid Network (BiFPN). For example, <xref ref-type="bibr" rid="B44">Zhai et&#xa0;al. (2023)</xref> introduced a Global Attention Mechanism (GAM) into the neck of YOLOv8, enabling the network to improve the interaction of global dimension features and fuse key features, thereby increasing the speed and accuracy of small object detection. <xref ref-type="bibr" rid="B1">Bao et&#xa0;al. (2023)</xref> employed a Double Dimensional Mixed Attention (DDMA) mechanism to fuse local and non-local attention information in the YOLOv5 network, reducing the missed detections caused by densely packed small objects. <xref ref-type="bibr" rid="B25">Ma et&#xa0;al. (2024)</xref> used the Enhanced Spatial Feature Pyramid Network (ESFPN) to combine high-resolution and low-resolution semantic information, creating additional high-resolution pyramid layers to improve small object detection capabilities. However, these methods are not well-suited for the unique conditions of underwater environments. To address the challenge of misdetections and missed detections caused by the varying scales of underwater objects, we propose employing the Generalized Feature Pyramid Network (GFPN) (<xref ref-type="bibr" rid="B17">Jiang et&#xa0;al., 2022</xref>) for feature fusion. This approach effectively utilizes the feature information of small objects, enhancing the robustness of the model in detecting small objects.</p>
</sec>
</sec>
<sec id="s3" sec-type="materials|methods">
<label>3</label>
<title>Materials and methods</title>
<p>In this paper, we propose a network specifically designed for underwater object detection, which improves the detection performance of small objects while maintaining accurate detection of normal-sized objects, and reduces the model&#x2019;s parameter count and computational complexity. The structure of the proposed network is shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. First, we introduce the reparameterization module RepGhost into the backbone of the detection network to achieve efficient feature reuse. Second, in the feature extraction network, we replace the standard depthwise convolution with SPD-Conv to prevent the loss of detail. Then, we employed GFPN to enhance the fusion of high-level semantic information and low-level spatial information in the neck of the network. Finally, we propose a new detection head, CLLAHead, which integrates cross-layer local attention mechanisms with Distribution Focal Loss (DFL) to improve object recognition and localization, thereby forming the LFN-YOLO underwater object detection model.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Illustration of the network structure of LFN-YOLO.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g001.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>RepGhost reparameterization module</title>
<p>Feature reuse plays an essential role in lightweight convolutional neural networks (<xref ref-type="bibr" rid="B26">Minaee et&#xa0;al., 2022</xref>). Existing feature reuse methods often use concatenation operations to reuse feature maps from different layers, which helps maintain a larger number of channels but results in a higher computational cost on hardware devices, posing challenges for real-world applications. To address this issue, we propose the RepGhost module, which uses structural reparameterization techniques to achieve feature reuse, eliminating the need for computationally expensive concatenation operations.</p>
<p>The RepGhost module is a lightweight convolutional module that replaces the concatenation operation used in Ghost modules with an additional operation, which is more efficient in terms of computation. The ReLU activation layer is moved behind the depthwise convolution and additional layers to conform to the rules of reparameterized structures. Lastly, a batch normalization (BN) branch is added during training, which is then fused with the depthwise convolution during inference, reducing floating-point operations. <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> illustrates the reparameterization process of RepGhost.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>
<bold>(A&#x2013;C)</bold> RepGhost module details.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g002.tif"/>
</fig>
<p>By introducing the RepGhost module into the backbone network of YOLOv8, we can train the object detection model more efficiently. During the inference stage, this approach enhances detection speed while minimizing accuracy loss, achieving a balance between simplifying model complexity and ensuring high detection performance. This enables the model to meet the demands of object detection tasks in scenarios with limited hardware resources, making it suitable for industrial applications.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>SPD-Conv</title>
<p>In object detection, especially when dealing with small objects, the amount of feature information is often limited. Standard stride convolutions and pooling can lead to a loss of detail, which is a major factor contributing to the low detection efficiency for small objects (<xref ref-type="bibr" rid="B5">Cheng et&#xa0;al., 2023a</xref>). To mitigate this issue, we introduce the SPD-Conv method, which replaces the standard convolution layers in the feature extraction network of YOLOv8.</p>
<p>The SPD-Conv is composed of a Space-to-Depth layer followed by a non-strided convolution layer. The Space-to-Depth layer downsamples the original feature map while preserving the information in the channel dimension, with this downsampling involving only a rearrangement of the data along the channel dimension, avoiding information loss. For any intermediate feature map <italic>X</italic> of size <italic>S</italic> &#xd7; <italic>S</italic> &#xd7; <italic>C</italic>
<sub>1</sub>, we can generate a series of sub-feature maps according to <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>.</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x22ee;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="bold">X</mml:mtext>
<mml:mo stretchy="false">[</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>:</mml:mo>
<mml:mo>&#x2004;</mml:mo>
<mml:mtext>scale</mml:mtext>
<mml:mo>&#x2004;</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>These feature sub-maps <italic>f<sub>x,y</sub>
</italic> are composed of all elements <bold>X</bold>(<italic>i</italic>, <italic>j</italic>) for which <italic>i</italic> + <italic>x</italic> and <italic>j</italic> + <italic>y</italic> are divisible by the scaling factor. Therefore, each sub-map is obtained by downsampling the original feature map <bold>X</bold> by a scaling factor. These sub-maps are then concatenated along the channel dimension to form a new feature map <inline-formula>
<mml:math display="inline" id="im1">
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
</mml:math>
</inline-formula>, where the spatial dimensions are reduced by the scaling factor, and the channel dimension is increased by the square of the scaling factor. In other words, Space-to-Depth transforms <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>C</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> into an intermediate feature map <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mtext>C</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref> illustrates the process of Space-to-Depth conversion when the scaling factor is set to 2.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>
<bold>(A&#x2013;E)</bold> Illustration of SPD-Conv when scale = 2.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g003.tif"/>
</fig>
<p>After applying the Space-to-Depth transformation, a non-strided (i.e., stride=1) convolution layer with <italic>C</italic>
<sub>2</sub> filters is added, where C<sub>2</sub> <italic>&lt;</italic> scale<sup>2</sup>C<sub>1</sub>. The feature map is then further transformed from <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mtext>C</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x2192;</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mtext>C</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. The reason for using non-strided convolution is to retain as much discriminative information as possible; otherwise, using stride=3 (as with a 3&#xd7;3 filter) would downsample the feature map but only sample each pixel once. If stride=2 were used, asymmetric sampling would occur, with different rows or columns being sampled at different times. Generally, strides greater than 1 lead to a loss of discriminative information. Although it may appear that this process downsamples the feature map from <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>S</mml:mtext>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>C</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x2192;</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Scale</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mtext>C</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, it fails to preserve the discriminative features of <inline-formula>
<mml:math display="inline" id="im6">
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>GFPN</title>
<p>In the feature extraction layers, the shallow layers have small receptive fields and limited ability to represent semantic information, but they are better at capturing geometric details with high-resolution feature maps, making them suitable for perceiving position and geometric details. In contrast, deeper layers have larger receptive fields and stronger semantic representation capabilities, but they are weaker at capturing geometric information and have lower resolution feature maps (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2023b</xref>). Therefore, enhancing the exchange of high-level semantic information with low-level spatial information is key to handling objects of varying scales (<xref ref-type="bibr" rid="B40">Xiao et&#xa0;al., 2025</xref>). To address this, we propose a novel cross-scale feature fusion method called the Generalized Feature Pyramid Network (GFPN). GFPN aggregates features from the same and adjacent levels to enable more efficient information transfer. It also employs skip connections to prevent gradient vanishing, improving the ability of features to propagate to deeper layers. While striking a balance between model size and performance, GFPN exhibits superior performance in feature fusion. The feature fusion structure of GFPN is illustrated in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The structure of GFPN, which takes feature maps extracted from different depths as input and outputs a set of fused feature maps that encapsulate rich semantic and spatial information.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g004.tif"/>
</fig>
<p>Since the GFPN structure is more complex compared to other feature fusion networks, its complexity increases with the depth of the layers leading to the issue of gradient vanishing. Inspired by the reparameterized GFPN used in DAMO-YOLO (<xref ref-type="bibr" rid="B42">Xu et&#xa0;al., 2023b</xref>), we adopt CSPStage to implement skip connections to replace the C2f (Cross Stage Partial Network Fusion) and combine convolutional layers, allowing information sharing between features across different spatial scales and non-adjacent semantic layers. This ensures that the network focuses on high-level semantic information while avoiding the loss of low-level spatial information.</p>
<p>The CSPStage module incorporates reparameterized convolutions (RepConv), which allow multiple computational branches to be fused during the inference phase, enhancing the efficiency and performance of the model. During training, RepConv uses multiple branches for convolution. During inference, the parameters from these branches are reparameterized into the main branch, thus reducing the computational load and memory requirements. By using CSPStage to implement skip connections, shallow feature information can be passed to deeper layers, minimizing the loss of features and enhancing the information exchange between shallow and deep layers. This improves the network&#x2019;s ability to adapt to targets of varying scales.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>CLLAHead</title>
<p>In scenarios where small objects are densely packed, the original detection head of YOLOv8 struggles to meet the demands of efficient and accurate detection. Therefore, we combined the Cross-Level Local Attention (CLLA) mechanism with Distribution Focal Loss (DFL) to design CLLAHead, which enhances the model&#x2019;s ability to recognize and localize objects in images.</p>
<p>The goal of CLLA is to model the contextual relationships between cross-level features and aggregate multi-level features, as shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. Different levels of features typically contain different recognition information. To capture fine-grained contextual information across different feature levels and improve the detection accuracy and robustness, we embedded the CLLA module into the detection head.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>The proposed CLLA model structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g005.tif"/>
</fig>
<p>The CLLA module models the relationships between the channels and spatial dimensions of high-level and low-level feature maps. Among them, <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent low-level and mid-level feature maps, containing shallow information (such as texture, edges, and color), while <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>3</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> contains valuable deep semantic information. Then, average pooling and 1&#xd7;1 convolutions are applied to reduce the size of <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, unifying their spatial dimensions into new feature maps <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mtext>l</mml:mtext>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mtext>m</mml:mtext>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> (with the same dimensions as <inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>3</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>). Subsequently, three learnable parameters <bold>W<sup>Q</sup>
</bold>,<bold>W<sup>K</sup>
</bold> and <bold>W<sup>V</sup>
</bold> are used to project <inline-formula>
<mml:math display="inline" id="im12a">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mtext>l</mml:mtext>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, <bold>F<sub>2</sub><sup>m</sup>
</bold> into the spaces of <bold>Q</bold>, <bold>K</bold>, and <bold>V</bold>, as shown in the following equation:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>Q</mml:mi>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mi>l</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mtext>Q</mml:mtext>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>K</mml:mi>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mi>m</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mtext>K</mml:mtext>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>V</mml:mi>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mi>m</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mtext>V</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Next, the dot product and the softmax function are used to calculate the correlation weights between <bold>Q</bold> and <bold>K</bold>, followed by a dot product with <bold>V</bold> to form a new feature map. This new feature map is then added to <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>3</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> to finally aggregate into <bold>F<sub>M</sub>
</bold>, which can be expressed by the following equation:</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>M</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>3</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mtext>softmax</mml:mtext>
<mml:mo>(</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mi>l</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mi>Q</mml:mi>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mi>m</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mtext>K</mml:mtext>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>T</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mo>)</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>2</mml:mn>
<mml:mi>m</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mtext>V</mml:mtext>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Distribution Focal Loss (DFL) enables the network to focus on values near the target label quickly, maximizing the probability density around the label. This guides the model to pay attention to difficult-to-detect targets, improving its ability to detect small objects. To optimize the probabilities at two positions near the label <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mtext>&#xa0;and&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, DFL uses the cross-entropy function to concentrate the network&#x2019;s distribution around the target label value. The formula for DFL is given below:</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mtext>DF</mml:mtext>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>s<sub>i</sub>
</italic> is the Sigmoid output of the network, and <italic>y<sub>i</sub>
</italic> and <italic>y<sub>i</sub>
</italic>
<sub>+1</sub> represent the interval labels for <inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiment preparation</title>
<p>To evaluate and validate the detection performance of our proposed model architecture, we conducted experiments using two challenging underwater object detection datasets.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset</title>
<p>To verify the effectiveness of the proposed method in this paper, the dataset used for the experiment contains two parts. One part is from the 2021 China Underwater Robot Professional Contest (URPC) dataset (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2021</xref>), which consists of 7,543 images in total. The dataset includes four categories of objects: holothurian, echinus, scallop, and starfish. The dataset presents challenges such as occlusions, overlapping objects, and small-sized underwater targets. In addition, it also includes significant color distortion caused by the absorption and scattering of light at different wavelengths underwater. This phenomenon results in a predominance of blue and green tones in the images, while red and other long-wavelength colors are heavily attenuated. These unique optical properties pose challenges in accurately recognizing and classifying underwater objects, making this dataset particularly valuable for testing methods aimed at enhancing robustness under such conditions. We randomly split the dataset into training, validation, and testing sets in a 7:2:1 ratio. Specifically, 5,280 images were used for training, 1,463 for validation, and 800 images were reserved for testing and performance evaluation. <xref ref-type="fig" rid="f6">
<bold>Figures&#xa0;6A, B</bold>
</xref> present the distribution of target counts across different categories in the URPC dataset, along with the height and width of bounding boxes. The analysis indicates that most underwater organisms within the dataset are relatively small, with the majority of bounding box dimensions falling within the range of (0-0.1, 0-0.1).</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>
<bold>(A, B)</bold> represent the number of different species and the label size distribution in the URPC dataset, respectively. <bold>(C, D)</bold> show the number of different species and the label size distribution in the Brackish dataset, respectively. <bold>(E)</bold> illustrates the representative underwater environments of the URPC and Brackish datasets.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g006.tif"/>
</fig>
<p>The other part is The Brackish dataset (<xref ref-type="bibr" rid="B27">Pedersen et&#xa0;al., 2019</xref>), which is a publicly available European underwater image dataset consisting of 11,205 images in total. It includes six categories of small marine organisms: crabs, normal-sized fish, small-sized fish, starfish, shrimp, and jellyfish. The Brackish dataset contains a significant number of small underwater targets. Moreover, the presence of numerous suspended particles in the water results in image blurring, reduced contrast, and even scattering artifacts, posing considerable challenges for accurate detection and recognition. The dataset also suffers from low image resolution, further complicating the detection and recognition of small marine organisms. These environmental factors make the Brackish dataset an essential benchmark for evaluating the performance of detection algorithms in turbid and low-visibility conditions. The dataset was randomly split into training, validation, and testing sets in an 8:1:1 ratio. <xref ref-type="fig" rid="f6">
<bold>Figures&#xa0;6C, D</bold>
</xref> illustrate the visual attributes of the Brackish dataset. Additionally, <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6E</bold>
</xref> illustrates the representative underwater environments of the URPC and Brackish datasets.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Evaluation metrics</title>
<p>In this paper, we use Precision (P), Recall (R), mean Average Precision (mAP), Giga Floating-point Operations (GFLOPs), the number of parameters, and Frames Per Second (FPS) to evaluate the effectiveness of the model. Precision (P) reflects the accuracy of classifying positive samples, while Recall (R) indicates the effectiveness of identifying positive samples. mAP represents the mean precision across all categories. GFLOPs is a commonly used metric for measuring the computational complexity of a model, representing the number of floating-point operations required by the model in a single forward pass. The number of parameters indicates the model&#x2019;s size. These metrics are widely adopted for evaluating object detection tasks. The formulas are as follows (<xref ref-type="disp-formula" rid="eq5">Equations 5</xref>&#x2013;<xref ref-type="disp-formula" rid="eq8">8</xref>):</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mtext>Recall</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mtext>AP</mml:mtext>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mi>P</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mtext>mAP</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mtext>AP</mml:mtext>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Here, TP represents the number of true positive samples correctly predicted by the model, FP represents the number of false positive samples, TN represents the number of true negative samples, and FN represents the number of false negative samples. P(r) represents the Precision-Recall curve, and k denotes the number of classes in the current recognition task. In object detection tasks, the mAP is determined by the selected Intersection over Union (IoU) threshold. mAP@0.5 refers to the mean Average Precision achieved by the model in object detection tasks when the IoU threshold is set to 0.5.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Experimental platform</title>
<p>All experiments in this study were conducted on the same computer, running the Windows 10 operating system, with an Intel<sup>&#xae;</sup> Xeon<sup>&#xae;</sup> Silver 4100 CPU, an NVIDIA GeForce RTX 2080Ti GPU, Python version 3.8, CUDA version 11.7, and PyTorch version 2.0.0. The experiments involved training the model for 150 epochs, with the batch size set to 16 and the learning rate set to 0.01. The model gradients were optimized using the SGD optimizer. The edge deployment device utilizes the NVIDIA Jetson AGX Orin with 32GB of RAM and runs the Ubuntu 20.04 Focal operating system. It uses Python 3.8, CUDA 11.4, and PyTorch v1.12.0 for GPU acceleration. For the camera, we employ the IPC5MPW underwater camera, which features a 5MP resolution, a 36mm lens, and a 2m focal length.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experimental results and analysis</title>
<sec id="s5_1">
<label>5.1</label>
<title>Ablation experiments</title>
<p>To validate the detection performance and model complexity of the proposed model in this study, as well as to explore the impact of specific network substructures on the model, we conducted ablation experiments based on YOLOv8n. The results are presented in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, with the best results highlighted in bold.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Ablation study on detection performance and complexity.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="bottom" rowspan="2" align="left">Group</th>
<th valign="bottom" rowspan="2" align="left">RepGhost</th>
<th valign="bottom" rowspan="2" align="left">SPD-Conv</th>
<th valign="bottom" rowspan="2" align="left">GFPN</th>
<th valign="bottom" rowspan="2" align="left">CLLAHead</th>
<th valign="bottom" colspan="4" align="center">URPC(%)</th>
<th valign="bottom" colspan="4" align="center">Brackish(%)</th>
<th valign="bottom" rowspan="2" align="center">Parameters/M</th>
<th valign="bottom" rowspan="2" align="center">GFLOPs</th>
</tr>
<tr>
<th valign="bottom" align="center">P</th>
<th valign="bottom" align="center">R</th>
<th valign="bottom" align="center">mAP@0.5</th>
<th valign="bottom" align="center">mAP@0.5:0.95</th>
<th valign="bottom" align="center">P</th>
<th valign="bottom" align="center">R</th>
<th valign="bottom" align="center">mAP@0.5</th>
<th valign="bottom" align="center">mAP@0.5:0.95</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="bottom" align="left">1</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center">79.7</td>
<td valign="bottom" align="center">64.2</td>
<td valign="bottom" align="center">71.9</td>
<td valign="bottom" align="center">39.6</td>
<td valign="bottom" align="center">96.3</td>
<td valign="bottom" align="center">94.2</td>
<td valign="bottom" align="center">96.9</td>
<td valign="bottom" align="center">78.5</td>
<td valign="bottom" align="center">3.2</td>
<td valign="bottom" align="center">8.9</td>
</tr>
<tr>
<td valign="bottom" align="left">2</td>
<td valign="bottom" align="left">&#x2713;</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center">78.9</td>
<td valign="bottom" align="center">63.7</td>
<td valign="bottom" align="center">71.5</td>
<td valign="bottom" align="center">39.8</td>
<td valign="bottom" align="center">97.3</td>
<td valign="bottom" align="center">94.4</td>
<td valign="bottom" align="center">97.2</td>
<td valign="bottom" align="center">78.6</td>
<td valign="bottom" align="center">
<bold>2.6</bold>
</td>
<td valign="bottom" align="center">
<bold>7.0</bold>
</td>
</tr>
<tr>
<td valign="bottom" align="left">3</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="left">&#x2713;</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center">80.0</td>
<td valign="bottom" align="center">65.4</td>
<td valign="bottom" align="center">72.8</td>
<td valign="bottom" align="center">40.6</td>
<td valign="bottom" align="center">97.1</td>
<td valign="bottom" align="center">94.8</td>
<td valign="bottom" align="center">97.4</td>
<td valign="bottom" align="center">78.8</td>
<td valign="bottom" align="center">2.8</td>
<td valign="bottom" align="center">7.6</td>
</tr>
<tr>
<td valign="bottom" align="left">4</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="left">&#x2713;</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center">81.5</td>
<td valign="bottom" align="center">64.5</td>
<td valign="bottom" align="center">72.5</td>
<td valign="bottom" align="center">40.3</td>
<td valign="bottom" align="center">97.0</td>
<td valign="bottom" align="center">95.2</td>
<td valign="bottom" align="center">97.6</td>
<td valign="bottom" align="center">79.1</td>
<td valign="bottom" align="center">3.1</td>
<td valign="bottom" align="center">8.1</td>
</tr>
<tr>
<td valign="bottom" align="left">5</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="left">&#x2713;</td>
<td valign="bottom" align="center">81.1</td>
<td valign="bottom" align="center">64.8</td>
<td valign="bottom" align="center">72.3</td>
<td valign="bottom" align="center">40.8</td>
<td valign="bottom" align="center">97.1</td>
<td valign="bottom" align="center">94.4</td>
<td valign="bottom" align="center">96.8</td>
<td valign="bottom" align="center">78.9</td>
<td valign="bottom" align="center">3.0</td>
<td valign="bottom" align="center">7.7</td>
</tr>
<tr>
<td valign="bottom" align="left">6</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center">80.3</td>
<td valign="bottom" align="center">65.1</td>
<td valign="bottom" align="center">72.6</td>
<td valign="bottom" align="center">40.7</td>
<td valign="bottom" align="center">96.8</td>
<td valign="bottom" align="center">94.9</td>
<td valign="bottom" align="center">97.1</td>
<td valign="bottom" align="center">79.2</td>
<td valign="bottom" align="center">2.9</td>
<td valign="bottom" align="center">7.5</td>
</tr>
<tr>
<td valign="bottom" align="left">7</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="bottom" align="center"/>
<td valign="bottom" align="center">81.4</td>
<td valign="bottom" align="center">65.4</td>
<td valign="bottom" align="center">73.8</td>
<td valign="bottom" align="center">41.5</td>
<td valign="bottom" align="center">97.3</td>
<td valign="bottom" align="center">95.1</td>
<td valign="bottom" align="center">
<bold>97.7</bold>
</td>
<td valign="bottom" align="center">79.5</td>
<td valign="bottom" align="center">2.8</td>
<td valign="bottom" align="center">7.8</td>
</tr>
<tr>
<td valign="bottom" align="left">8</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="top" align="left">&#x2713;</td>
<td valign="bottom" align="left">&#x2713;</td>
<td valign="bottom" align="center">
<bold>82.1</bold>
</td>
<td valign="bottom" align="center">
<bold>65.7</bold>
</td>
<td valign="bottom" align="center">
<bold>74.1</bold>
</td>
<td valign="bottom" align="center">
<bold>42.1</bold>
</td>
<td valign="bottom" align="center">
<bold>97.4</bold>
</td>
<td valign="bottom" align="center">
<bold>95.4</bold>
</td>
<td valign="bottom" align="center">97.5</td>
<td valign="bottom" align="center">
<bold>79.8</bold>
</td>
<td valign="bottom" align="center">2.7</td>
<td valign="bottom" align="center">7.2</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x2018;&#x2713;&#x2019; represents the introduced module; with the best results highlighted in bold.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The primary goal of the first set of experiments was to evaluate the detection capabilities of the original model. Subsequently, we conducted experiments to improve the model, both individually and collectively, using RepGhost, SPD-Conv, GFPN, and CLLAHead, to assess the effectiveness of these four enhancement techniques across the two datasets. The original model achieved precision rates of 79.7 and 96.3 on the URPC and Brackish datasets, respectively. After individually evaluating each improvement, we found that the introduction of the RepGhost module slightly reduced accuracy on the URPC dataset, but it significantly alleviated the issue of high model parameters and computational load. In the subsequent combined experiments, RepGhost had a positive impact on the performance of underwater object detection tasks. On the Brackish dataset, the RepGhost module improved both model accuracy and complexity. Additionally, we observed that GFPN resulted in noticeable performance gains on both datasets, further demonstrating the effectiveness and feasibility of GFPN&#x2019;s feature fusion approach for detecting small underwater objects.</p>
<p>Our proposed LFN-YOLO network achieved accuracy rates of 82.1% and 97.4% on the URPC and Brackish datasets, respectively, representing improvements of 2.4% and 1.1% compared to the original model. Furthermore, the parameter counts and GFLOPs were reduced by 15.6% and 19.1%, respectively. In addition, the network demonstrated an increase in recall and mAP@0.5 by 1.5% and 2.2% on the URPC dataset and by 1.3% and 0.6% on the Brackish dataset. These results highlight the adaptability and robustness of the LFN-YOLO model in different underwater environments. <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref> illustrates a comparison of the detection performance between the original YOLOv8 model and the improved LFN-YOLO model for underwater object detection.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Comparison of detection results of different methods <bold>(A, C, E, G)</bold> results from YOLOv8, and <bold>(B, D, F, H)</bold> results from LFN-YOLO.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g007.tif"/>
</fig>
<p>The interactions between these improvements are realized through their complementary characteristics and advantages, working synergistically to reinforce each other and ultimately form a collaborative network for underwater object detection tasks. To validate the effectiveness of the proposed model in enhancing underwater detection performance, we compared the mAP@0.5 and box loss fitting curves of LFN-YOLO and YOLOv8n over 150 training epochs, as shown in <xref ref-type="fig" rid="f8">
<bold>Figures&#xa0;8A&#x2013;D</bold>
</xref>. Additionally, <xref ref-type="fig" rid="f8">
<bold>Figures&#xa0;8E, F</bold>
</xref> presents a detailed comparison of detection precision for various organisms in the dataset between LFN-YOLO and YOLOv8n. Experimental results demonstrate that LFN-YOLO significantly outperforms YOLOv8n in detecting small objects, such as echinus, scallops, jellyfish, and small fish. This demonstrates LFN-YOLO&#x2019;s exceptional ability to capture fine details and detect small-scale underwater organisms, which is crucial for achieving accurate underwater object detection.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>
<bold>(A, B)</bold> represent the fitting curves during training on the URPC dataset, while <bold>(C, D)</bold> show the fitting curves for the Brackish dataset. <bold>(E, F)</bold> respectively illustrate the accuracy comparison for different species in the URPC and Brackish datasets.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g008.tif"/>
</fig>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>Comparative experiments</title>
<p>To further demonstrate that the LFN-YOLO model achieves a better balance between model complexity and accuracy, we conducted comparative experiments with eight other mainstream one-stage detection models. All experiments were carried out under the same settings, evaluating the models based on accuracy, recall, mAP@0.5, number of parameters, computational complexity, and model size. The detailed results are presented in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. All the algorithms compared in the experiments meet the real-time monitoring requirements and LFN-YOLO exhibits higher detection accuracy while maintaining a lighter model complexity.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison experiments of LFN-YOLO on URPC and Brackish datasets, with the best results highlighted in bold, and the second-best results highlighted in red.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" rowspan="2" align="left">Model</th>
<th valign="top" colspan="5" align="center">URPC(%)</th>
<th valign="top" colspan="5" align="center">Brackish(%)</th>
<th valign="top" rowspan="2" align="center">Parameters/M</th>
<th valign="top" rowspan="2" align="center">GFLOPs</th>
<th valign="top" rowspan="2" align="center">Model size/Mb</th>
</tr>
<tr>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP@0.5</th>
<th valign="top" align="center">mAP@0.5:0.95</th>
<th valign="top" align="center">FPS</th>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP@0.5</th>
<th valign="top" align="center">mAP@0.5:0.95</th>
<th valign="top" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv5n</td>
<td valign="top" align="center">77.9</td>
<td valign="top" align="center">64.3</td>
<td valign="top" align="center">71.8</td>
<td valign="top" align="center">39.7</td>
<td valign="top" align="center">44</td>
<td valign="top" align="center">96.1</td>
<td valign="top" align="center">94.1</td>
<td valign="top" align="center">96.9</td>
<td valign="top" align="center">77.3</td>
<td valign="top" align="center">51</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">7.8</td>
<td valign="top" align="center">5.0</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv6-N</td>
<td valign="top" align="center">78.0</td>
<td valign="top" align="center">62.1</td>
<td valign="top" align="center">70.7</td>
<td valign="top" align="center">39.0</td>
<td valign="top" align="center">57</td>
<td valign="top" align="center">96.6</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">96.4</td>
<td valign="top" align="center">76.1</td>
<td valign="top" align="center">61</td>
<td valign="top" align="center">4.5</td>
<td valign="top" align="center">11.9</td>
<td valign="top" align="center">8.3</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7tiny</td>
<td valign="top" align="center">80.6</td>
<td valign="top" align="center">64.9</td>
<td valign="top" align="center">73.2</td>
<td valign="top" align="center">40.9</td>
<td valign="top" align="center">48</td>
<td valign="top" align="center">95.7</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">96.5</td>
<td valign="top" align="center">75.1</td>
<td valign="top" align="center">58</td>
<td valign="top" align="center">6.2</td>
<td valign="top" align="center">13.2</td>
<td valign="top" align="center">12.0</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8n</td>
<td valign="top" align="center">79.7</td>
<td valign="top" align="center">64.2</td>
<td valign="top" align="center">71.9</td>
<td valign="top" align="center">39.6</td>
<td valign="top" align="center">52</td>
<td valign="top" align="center">96.3</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">96.9</td>
<td valign="top" align="center">79.0</td>
<td valign="top" align="center">59</td>
<td valign="top" align="center">3.2</td>
<td valign="top" align="center">8.9</td>
<td valign="top" align="center">6.0</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv9t</td>
<td valign="top" align="center">80.2</td>
<td valign="top" align="center">64.6</td>
<td valign="top" align="center">73.1</td>
<td valign="top" align="center">41.1</td>
<td valign="top" align="center">37</td>
<td valign="top" align="center">97.3</td>
<td valign="top" align="center">95.0</td>
<td valign="top" align="center">97.5</td>
<td valign="top" align="center">78.5</td>
<td valign="top" align="center">39</td>
<td valign="top" align="center">
<bold>2.0</bold>
</td>
<td valign="top" align="center">7.9</td>
<td valign="top" align="center">8.8</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv10n</td>
<td valign="top" align="center">75.8</td>
<td valign="top" align="center">63.9</td>
<td valign="top" align="center">70.8</td>
<td valign="top" align="center">39.0</td>
<td valign="top" align="center">42</td>
<td valign="top" align="center">96.2</td>
<td valign="top" align="center">94.3</td>
<td valign="top" align="center">97.0</td>
<td valign="top" align="center">79.1</td>
<td valign="top" align="center">42</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">8.4</td>
<td valign="top" align="center">5.5</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv11n</td>
<td valign="top" align="center">78.9</td>
<td valign="top" align="center">65.2</td>
<td valign="top" align="center">72.1</td>
<td valign="top" align="center">41.1</td>
<td valign="top" align="center">54</td>
<td valign="top" align="center">96.9</td>
<td valign="top" align="center">94.0</td>
<td valign="top" align="center">97.2</td>
<td valign="top" align="center">78.2</td>
<td valign="top" align="center">60</td>
<td valign="top" align="center">2.6</td>
<td valign="top" align="center">
<bold>6.5</bold>
</td>
<td valign="top" align="center">5.3</td>
</tr>
<tr>
<td valign="top" align="left">RetinaNet</td>
<td valign="top" align="center">75.2</td>
<td valign="top" align="center">66.8</td>
<td valign="top" align="center">73.4</td>
<td valign="top" align="center">41.4</td>
<td valign="top" align="center">18</td>
<td valign="top" align="center">94.8</td>
<td valign="top" align="center">94.6</td>
<td valign="top" align="center">96.5</td>
<td valign="top" align="center">75.9</td>
<td valign="top" align="center">22</td>
<td valign="top" align="center">36.2</td>
<td valign="top" align="center">206.0</td>
<td valign="top" align="center">80.1</td>
</tr>
<tr>
<td valign="top" align="left">RT-DETR</td>
<td valign="top" align="center">74.3</td>
<td valign="top" align="center">61.3</td>
<td valign="top" align="center">68.0</td>
<td valign="top" align="center">38.6</td>
<td valign="top" align="center">20</td>
<td valign="top" align="center">97.8</td>
<td valign="top" align="center">94.8</td>
<td valign="top" align="center">97.2</td>
<td valign="top" align="center">79.3</td>
<td valign="top" align="center">22</td>
<td valign="top" align="center">32.8</td>
<td valign="top" align="center">109.0</td>
<td valign="top" align="center">63.4</td>
</tr>
<tr>
<td valign="top" align="left">SSD</td>
<td valign="top" align="center">
<bold>84.4</bold>
</td>
<td valign="top" align="center">
<bold>68.4</bold>
</td>
<td valign="top" align="center">
<bold>75.4</bold>
</td>
<td valign="top" align="center">
<bold>42.3</bold>
</td>
<td valign="top" align="center">51</td>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">95.8</td>
<td valign="top" align="center">76.4</td>
<td valign="top" align="center">57</td>
<td valign="top" align="center">26.4</td>
<td valign="top" align="center">116.2</td>
<td valign="top" align="center">92.1</td>
</tr>
<tr>
<td valign="top" align="left">LFN-YOLO</td>
<td valign="top" align="center">82.1</td>
<td valign="top" align="center">65.7</td>
<td valign="top" align="center">74.1</td>
<td valign="top" align="center">42.1</td>
<td valign="top" align="center">
<bold>58</bold>
</td>
<td valign="top" align="center">
<bold>97.4</bold>
</td>
<td valign="top" align="center">
<bold>95.4</bold>
</td>
<td valign="top" align="center">
<bold>97.5</bold>
</td>
<td valign="top" align="center">
<bold>79.8</bold>
</td>
<td valign="top" align="center">
<bold>63</bold>
</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">7.2</td>
<td valign="top" align="center">5.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Based on the experimental results in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, LFN-YOLO demonstrates outstanding performance on both the URPC and Brackish datasets. On the URPC dataset, LFN-YOLO achieved an mAP@0.5 of 74.1%, surpassing YOLOv5n, YOLOv6-N, YOLOv8n, YOLOv10n, and YOLOv11n by 2.3%, 3.4%, 2.2%, 3.3%, and 0.5%, respectively. For the more rigorous mAP@0.5:0.95 evaluation metric, LFN-YOLO remains the best among the YOLO series models, showcasing its exceptional capability in detecting underwater targets. Additionally, LFN-YOLO achieves an inference speed of 58 FPS, highlighting its significant advantage in real-time detection tasks. In contrast, the SSD model, with VGG-16 as its backbone, demonstrated the best performance on the URPC dataset, owing to the unique distribution of objects and scene characteristics in the URPC dataset; however, its poor performance on the Brackish dataset reveals a lack of generalization and robustness, which are essential qualities for underwater object detection models. Moreover, SSD, being relatively large among one-stage algorithms, is not suitable for underwater target recognition tasks on unmanned platforms. Notably, among the one-stage algorithms included in our comparative experiments, only RetinaNet requires higher hardware performance to meet the real-time demands of underwater detection tasks, as its lower FPS makes it unsuitable for real-time underwater target detection.</p>
<p>
<xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref> presents a comparison of the detection results for underwater objects using the seven models with the best performance from <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. We used the same RGB color for the detection boxes of all YOLO series algorithms for easier comparison. YOLOv5n exhibited lower confidence scores for detected objects, YOLOv11n showed instances of missed detections, while YOLOv7tiny, YOLOv8n, and SSD suffered from false positives. YOLOv9t experienced both false positives and missed detections. In contrast, LFN-YOLO not only accurately identified the underwater objects but also achieved higher confidence scores. Among the networks compared, LFN-YOLO boasts the most lightweight structure.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>
<bold>(A&#x2013;H)</bold> Presentation of detection results from seven advanced models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g009.tif"/>
</fig>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>Model edge deployment</title>
<p>With the increasing computational capabilities of edge deployment devices, deep learning-based object detection tasks are now more feasible. In this study, we deploy LFN-YOLO on the NVIDIA Jetson AGX Orin edge computing device to further validate its applicability in underwater platforms. To this end, we selected fish fry as the target for recognition, with the recognition scenario being a 200L fish tank. Our algorithm is deployed and tested underwater, as shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10A</bold>
</xref>, which illustrates the hardware connection diagram.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>
<bold>(A)</bold> is the hardware connection diagram. <bold>(B, C)</bold> present the recognition results of the LFN-YOLO deployment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g010.tif"/>
</fig>
<p>First, we trained both the LFN-YOLO and YOLOv8n models using the Brackish dataset on a PC and transferred the trained models to the project directory on the NVIDIA Jetson AGX Orin. Subsequently, an underwater camera was placed in the tank and connected to the RJ45 port of the NVIDIA Jetson AGX Orin, and an external monitor was connected via the HDMI port on the Jetson AGX Orin. Finally, a pre-written Python script was used to capture the underwater camera feed as network input, run predictions, and display real-time data. The real-time detection results of LFN-YOLO are shown in <xref ref-type="fig" rid="f10">
<bold>Figures&#xa0;10B, C</bold>
</xref>.</p>
<p>To evaluate the practical applicability of LFN-YOLO, we compared its detection performance against that of YOLOv8n. As illustrated in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>, YOLOv8n exhibited notable deficiencies, including frequently missed detections and low confidence scores. In contrast, LFN-YOLO demonstrated superior performance, particularly in terms of real-time detection speed, where it outpaced YOLOv8n by a substantial margin. These results demonstrate that our method performs stably in underwater environments, enabling real-time detection and recognition of underwater targets, thus validating the effectiveness and practicality of the algorithm.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>
<bold>(A, B)</bold> Comparison of recognition results for edge deployment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g011.tif"/>
</fig>
</sec>
</sec>
<sec id="s6" sec-type="discussion">
<label>6</label>
<title>Discussion</title>
<p>The LFN-YOLO model excels in underwater object detection, particularly in terms of accuracy and lightweight design. This can be attributed to our specially designed deep learning-based object detection network, which seamlessly integrates various modules to address the inherent challenges of the underwater environment. Importantly, the architecture of the LFN-YOLO network was not specifically designed for our experimental dataset, highlighting the network&#x2019;s broad applicability and strong generalization capabilities in underwater tasks. To further evaluate its effectiveness, we compared LFN-YOLO with other leading object detection algorithms using the TrashCan dataset (<xref ref-type="bibr" rid="B13">Hong et&#xa0;al., 2020</xref>). This dataset comprises 7,212 underwater images, featuring 22 categories of underwater objects such as debris, ROVs, and various marine species. The images were sourced from the J-EDI (JAMSTEC E-library of Deep-sea Images), managed by the Japan Agency for Marine-Earth Science and Technology (JAMSTEC).</p>
<p>
<xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref> presents the comprehensive experimental results of LFN-YOLO compared to other models on the TrashCan dataset. Our model achieved a mAP@0.5 of 66.2%, demonstrating a significant advantage in accuracy over other algorithms, while also being more lightweight in terms of model parameters and GFLOPs compared to many real-time detection algorithms. <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref> shows the detection results of LFN-YOLO across four different underwater environments in the TrashCan dataset. In particularly challenging conditions where human vision struggles to discern objects that blend almost seamlessly with their surroundings, LFN-YOLO can accurately and effectively detect these targets. As illustrated in <xref ref-type="fig" rid="f12">
<bold>Figures&#xa0;12A, B</bold>
</xref>, even in scenarios with extremely low contrast and densely clustered small objects, the model successfully identifies underwater targets. Similarly, in <xref ref-type="fig" rid="f12">
<bold>Figures&#xa0;12C, D</bold>
</xref>, LFN-YOLO demonstrates strong robustness in detecting objects in complex backgrounds with occlusions.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Experimental results of LFN-YOLO and other object detection models on the TrashCan dataset, with the best results highlighted in bold.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Parameters/M</th>
<th valign="top" align="center">GFLOPs</th>
<th valign="top" align="center">mAP@0.5/%</th>
<th valign="top" align="center">mAP@0.5:0.95/%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Faster R-CNN</td>
<td valign="top" align="center">41.4</td>
<td valign="top" align="center">135</td>
<td valign="top" align="center">55.3</td>
<td valign="top" align="center">38.2</td>
</tr>
<tr>
<td valign="top" align="left">RT-DETR</td>
<td valign="top" align="center">32.8</td>
<td valign="top" align="center">109</td>
<td valign="top" align="center">61.4</td>
<td valign="top" align="center">44.2</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5n</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">7.8</td>
<td valign="top" align="center">61.7</td>
<td valign="top" align="center">43.7</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv6-N</td>
<td valign="top" align="center">4.5</td>
<td valign="top" align="center">11.9</td>
<td valign="top" align="center">58.8</td>
<td valign="top" align="center">41.6</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7-tiny</td>
<td valign="top" align="center">6.2</td>
<td valign="top" align="center">13.2</td>
<td valign="top" align="center">65.9</td>
<td valign="top" align="center">45.0</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8n</td>
<td valign="top" align="center">3.2</td>
<td valign="top" align="center">8.9</td>
<td valign="top" align="center">64.1</td>
<td valign="top" align="center">45.8</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv9t</td>
<td valign="top" align="center">
<bold>2.0</bold>
</td>
<td valign="top" align="center">7.9</td>
<td valign="top" align="center">63.9</td>
<td valign="top" align="center">45.6</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv10n</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">8.4</td>
<td valign="top" align="center">61.0</td>
<td valign="top" align="center">43.6</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv11n</td>
<td valign="top" align="center">2.6</td>
<td valign="top" align="center">
<bold>6.5</bold>
</td>
<td valign="top" align="center">63.4</td>
<td valign="top" align="center">45.3</td>
</tr>
<tr>
<td valign="top" align="left">SSD</td>
<td valign="top" align="center">26.3</td>
<td valign="top" align="center">116.2</td>
<td valign="top" align="center">58.1</td>
<td valign="top" align="center">40.4</td>
</tr>
<tr>
<td valign="top" align="left">LFN-YOLO</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">7.2</td>
<td valign="top" align="center">
<bold>66.2</bold>
</td>
<td valign="top" align="center">
<bold>47.1</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>
<bold>(A&#x2013;D)</bold> Detection results of LFN-YOLO in four different underwater environments.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g012.tif"/>
</fig>
<p>Although our model has made significant progress in advancing the deployment of underwater object detection systems, LFN-YOLO still faces challenges with false positives and missed detections in highly variable underwater environments. As shown in <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref>, using the URPC dataset as an example, LFN-YOLO struggles with small object detection in complex backgrounds due to limitations in feature extraction, leading to missed detections. Additionally, under low-resolution conditions, such as those represented by the Brackish dataset, small object detection is easily affected by occlusion and insufficient resolution, resulting in inaccurate localization. Furthermore, in scenarios with large variations in object scale, such as the TrashCan dataset, LFN-YOLO still needs improvement in detecting targets with significant scale changes in underwater images.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Presentation of representative failure cases. Panels <bold>(A, B)</bold> show missed detections in complex backgrounds from the URPC dataset. Panels <bold>(C, D)</bold> illustrate misdetections in low-resolution images from the Brackish dataset. Panels <bold>(E, F)</bold> depict missed and misdetections due to significant scale variations in the TrashCan dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1513740-g013.tif"/>
</fig>
</sec>
<sec id="s7" sec-type="conclusions">
<label>7</label>
<title>Conclusion</title>
<p>The detection of small underwater organisms is of great significance for marine life sciences and resource exploration. This paper proposes a lightweight underwater object detection model based on deep learning, which achieves both lightweight design and high accuracy while demonstrating excellent generalization and robustness, essential qualities of a strong model. Firstly, the model introduces a lightweight re-parameterization technique, RepGhost, to achieve feature reuse, reduce the number of parameters, and improve both training efficiency and inference speed, minimizing the accuracy loss while maintaining a lightweight backbone network. The feature extraction network is further enhanced by incorporating SPD-Conv convolution modules, which improves the effective extraction of small object features. Secondly, to address challenges such as small object size, dense distribution, and blurry imaging in underwater visible light conditions, we propose a GFPN (General Feature Pyramid Network) for feature fusion, enabling effective extraction of features across varying object scales. Finally, cross-layer local attention mechanisms are added to the detection head to reduce unnecessary computations and enhance model robustness. A DFL (Distribution Focal Loss) is also introduced to minimize regression and classification losses. LFN-YOLO achieves strong detection results on the URPC, Brackish, and TrashCan datasets, with mAP@0.5 scores of 82.2%, 97.5%, and 66.2%, respectively, improving upon YOLOv8 by 2.6%, 1.2%, and 2.1%. Meanwhile, the model reduces parameters and GFLOPs by 15.6% and 19.1%, meeting the requirements for both lightweight design and high precision. This makes it suitable for small underwater object detection and marine species diversity surveys. 
In the future, we will explore underwater multi-source information fusion, specifically by integrating underwater visible light images with various underwater sensors, such as sonar, to enable the model to perform underwater exploration tasks in low-light or no-light conditions. This approach aims to further enhance the model&#x2019;s generalization capability and adaptability to diverse environments. At the same time, we will optimize the model end-to-end to improve its real-time detection capabilities. This will not only assist researchers in conducting more efficient marine resource surveys but also provide robust technological support for underwater ecological conservation.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>ML: Conceptualization, Funding acquisition, Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. YW: Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. RL: Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. CL: Conceptualization, Project administration, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s10" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was partly supported by the National Natural Science Foundation of China (62171143), Guangdong Provincial University Innovation Team (2023KCXTD016), special projects in key fields of ordinary universities in Guangdong Province (2021ZDZX1060), the Stable Supporting Fund of Acoustic Science and Technology Laboratory (JCKYS2024604SSJS00301), the Undergraduate Innovation Team Project of Guangdong Ocean University under Grant CXTD2024011, the Open Fund of Guangdong Provincial Key Laboratory of Intelligent Equipment for South China Sea Marine Ranching (Grant NO. 2023B1212030003).</p>
</sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Uav remote sensing detection of tea leaf blight based on ddma-yolo</article-title>. <source>Comput. Electron. Agric.</source> <volume>205</volume>, <elocation-id>107637</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107637</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Repghost: A hardware-efficient ghost module via re-parameterization</article-title>. Available online at: <uri xlink:href="https://arxiv.org/abs/2211.06088">https://arxiv.org/abs/2211.06088</uri>.</citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kao</surname> <given-names>S.-h.</given-names>
</name>
<name>
<surname>He</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhuo</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>C.-H.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>a). &#x201c;<article-title>Run, don&#x2019;t walk: Chasing higher flops for faster neural networks</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>12021</fpage>&#x2013;<lpage>12031</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01157</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>b). <article-title>Info-fpn: An informative feature pyramid network for object detection in remote sensing images</article-title>. <source>Expert Syst. Appl.</source> <volume>214</volume>, <elocation-id>119132</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2022.119132</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>a). <article-title>Towards large-scale small object detection: Survey and benchmarks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume>, <fpage>13467</fpage>&#x2013;<lpage>13488</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2023.3290594</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2023</year>b). <article-title>Drone detection method based on mobilevit and ca-panet</article-title>. <source>Electronics</source> <volume>12</volume>, <fpage>223</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics12010223</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>R-fcn: Object detection via region-based fully convolutional networks</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems</source>, vol. <volume>29</volume> . Eds. <person-group person-group-type="editor">
<name>
<surname>Lee</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Sugiyama</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Luxburg</surname> <given-names>U.</given-names>
</name>
<name>
<surname>Guyon</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Garnett</surname> <given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Curran Associates, Inc</publisher-name>).</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Er</surname> <given-names>M. J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Research challenges, recent advances, and popular datasets in deep learning-based underwater marine object detection: A review</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>1990</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23041990</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Ceh-yolo: A composite enhanced yolo-based model for underwater object detection</article-title>. <source>Ecol. Inf.</source> <volume>82</volume>, <elocation-id>102758</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2024.102758</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grip</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Blomqvist</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Marine nature conservation and conflicts with fisheries</article-title>. <source>Ambio</source> <volume>49</volume>, <fpage>1328</fpage>&#x2013;<lpage>1340</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s13280-019-01279-7</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Ghostnet: More features from cheap operations</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, New Jersey, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1577</fpage>&#x2013;<lpage>1586</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00165</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Gkioxari</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Dollar</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Mask r-cnn</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE International Conference on Computer Vision (ICCV)</conf-name>. (<publisher-loc>Piscataway, New Jersey, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Fulton</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sattar</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Trashcan: A semantically-segmented dataset towards visual detection of marine debris</article-title>. Available online at: <uri xlink:href="https://arxiv.org/abs/2007.08097">https://arxiv.org/abs/2007.08097</uri>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Howard</surname> <given-names>A. G.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Mobilenets: Efficient convolutional neural networks for mobile vision applications</article-title>. <source>arXiv preprint arXiv:1704.04861</source>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Howard</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Searching for mobilenetv3</article-title>. <source>Proc. IEEE/CVF Int. Conf. Comput. Vision (ICCV)</source>, <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00140</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jian</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Underwater image processing and analysis: A review</article-title>. <source>Signal Process.: Image Commun.</source> <volume>91</volume>, <elocation-id>116088</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.image.2020.116088</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Giraffedet: A heavy-neck paradigm for object detection</article-title>. Available online at: <uri xlink:href="https://arxiv.org/abs/2202.04256">https://arxiv.org/abs/2202.04256</uri>.</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Krishna</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Jawahar</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Improving small object detection</article-title>,&#x201d; in <conf-name>2017 4th IAPR Asian Conference on Pattern Recognition (ACPR)</conf-name>. (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>340</fpage>&#x2013;<lpage>345</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACPR.2017.149</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Generalized focal loss: Towards efficient representation learning for dense object detection</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume>, <fpage>3139</fpage>&#x2013;<lpage>3153</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2022.3180392</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Dtcnet: Transformer-cnn distillation for super-resolution of remote sensing image</article-title>. <source>IEEE J. Select. Topics Appl. Earth Observ. Remote Sens.</source> <volume>17</volume>, <fpage>11117</fpage>&#x2013;<lpage>11133</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2024.3409808</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Anguelov</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Erhan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Szegedy</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Reed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>C.-Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). &#x201c;<article-title>Ssd: Single shot multibox detector</article-title>,&#x201d; in <source>Computer Vision &#x2013; ECCV 2016</source>. Eds. <person-group person-group-type="editor">
<name>
<surname>Leibe</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Matas</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sebe</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Welling</surname> <given-names>M.</given-names>
</name>
</person-group> (<publisher-name>Springer International Publishing</publisher-name>, <publisher-loc>Cham</publisher-loc>), <fpage>21</fpage>&#x2013;<lpage>37</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-46448-0_2</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>b). <article-title>A deep learning approach for object detection of rockfish in challenging underwater environments</article-title>. <source>Front. Mar. Sci.</source> <volume>10</volume>, <elocation-id>1242041</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fmars.2023.1242041</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>A dataset and benchmark of underwater object detection for robot picking</article-title>,&#x201d; in <conf-name>2021 IEEE International Conference on Multimedia &amp; Expo Workshops (ICMEW)</conf-name>. (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICMEW53276.2021.9455997</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2023</year>a). <article-title>Underwater target detection based on improved yolov7</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>11</volume>, <fpage>677</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse11030677</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>P.</given-names>
</name>
<name>
<surname>He</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Isod: Improved small object detection based on extended scale feature pyramid network</article-title>. <source>Visual Comput.</source> <volume>40</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00371-024-03341-2</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Minaee</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Boykov</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Porikli</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Plaza</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kehtarnavaz</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Terzopoulos</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Image segmentation using deep learning: A survey</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>3523</fpage>&#x2013;<lpage>3542</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2021.3059968</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pedersen</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Bruslund Haurum</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Gade</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Moeslund</surname> <given-names>T. B.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Detection of marine animals in a new underwater dataset with varying visibility</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops</conf-name>. (<publisher-loc>Long Beach, California, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>18</fpage>&#x2013;<lpage>26</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: Unified, real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Faster r-cnn: Towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.-Y.</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Focal loss for dense object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE International Conference on Computer Vision (ICCV)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2980</fpage>&#x2013;<lpage>2988</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Mobilenetv2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00474</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Hua</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Using lightweight deep learning algorithm for real-time detection of apple flowers in natural environments</article-title>. <source>Comput. Electron. Agric.</source> <volume>207</volume>, <elocation-id>107765</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107765</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sunkara</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>No more strided convolutions or pooling: A new cnn building block for low-resolution images and small objects</article-title>,&#x201d; in <source>Machine Learning and Knowledge Discovery in Databases</source>. Eds. <person-group person-group-type="editor">
<name>
<surname>Amini</surname> <given-names>M.-R.</given-names>
</name>
<name>
<surname>Canu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Fischer</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Guns</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Novak</surname> <given-names>P. K.</given-names>
</name>
<name>
<surname>Tsoumakas</surname> <given-names>G.</given-names>
</name>
</person-group> (<publisher-name>Springer Nature Switzerland</publisher-name>, <publisher-loc>Cham</publisher-loc>), <fpage>443</fpage>&#x2013;<lpage>459</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Le</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Efficientnet: Rethinking model scaling for convolutional neural networks</article-title>,&#x201d; in <source>Proceedings of the 36th International Conference on Machine Learning</source>, vol. <volume>97</volume>. Eds. <person-group person-group-type="editor">
<name>
<surname>Chaudhuri</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Salakhutdinov</surname> <given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>6105</fpage>&#x2013;<lpage>6114</lpage>.</citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Class: Cross-level attention and supervision for salient objects detection</article-title>,&#x201d; in <conf-name>Proceedings of the Asian Conference on Computer Vision (ACCV)</conf-name>. (<publisher-loc>Heidelberg, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>).</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tong</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Deep learning-based detection from the perspective of small or tiny objects: A survey</article-title>. <source>Image Vision Comput.</source> <volume>123</volume>, <elocation-id>104471</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.imavis.2022.104471</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xia</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Underwater object detection based on enhanced yolo</article-title>,&#x201d; in <conf-name>2022 International Conference on Image Processing and Media Computing (ICIPMC)</conf-name>. (<publisher-loc>New York, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>17</fpage>&#x2013;<lpage>21</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICIPMC55686.2022.00012</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Bai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A reinforcement learning paradigm of configuring visual enhancement for object detection in underwater scenes</article-title>. <source>IEEE J. Ocean. Eng.</source> <volume>48</volume>, <fpage>443</fpage>&#x2013;<lpage>461</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JOE.2022.3226202</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Self-organized underwater image enhancement</article-title>. <source>ISPRS J. Photogram. Remote Sens.</source> <volume>215</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isprsjprs.2024.06.019</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A dual-path feature reuse multi-scale network for remote sensing image super-resolution</article-title>. <source>J. Supercomput.</source> <volume>81</volume>, <fpage>1</fpage>&#x2013;<lpage>28</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11227-024-06569-w</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>a). &#x201c;<article-title>Gfspp-yolo: A light yolo model based on group fast spatial pyramid pooling</article-title>,&#x201d; in <conf-name>2023 IEEE 11th International Conference on Information, Communication and Networks (ICICN)</conf-name>. (<publisher-loc>New York, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>733</fpage>&#x2013;<lpage>738</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICICN59530.2023.10393445</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>b). <article-title>Damo-yolo: A report on real-time object detection design</article-title>. Available online at: <uri xlink:href="https://arxiv.org/abs/2211.15444">https://arxiv.org/abs/2211.15444</uri>.</citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Dang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Dual adversarial contrastive learning for underwater image enhancement</article-title>,&#x201d; in <conf-name>2023 2nd International Conference on Image Processing and Media Computing (ICIPMC)</conf-name>. (<publisher-loc>New York, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICIPMC58929.2023.00008</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Yolo-drone: An optimized yolov8 network for tiny uav object detection</article-title>. <source>Electronics</source> <volume>12</volume>, <fpage>3664</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics12173664</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>b). <article-title>Underwater image color correction via color channel transfer</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>21</volume>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2023.3344630</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>a). <article-title>Efficient small-object detection in underwater images using the enhanced yolov8 network</article-title>. <source>Appl. Sci.</source> <volume>14</volume>, <fpage>1095</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app14031095</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yun</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Yolov7-chs: An emerging model for underwater object detection</article-title>. <source>J. Mar. Sci. Eng.</source> <volume>11</volume>, <fpage>1949</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jmse11101949</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Bu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Towards standardizing automated image analysis with artificial intelligence for biodiversity</article-title>. <source>Front. Mar. Sci.</source> <volume>11</volume>, <fpage>1349705</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fmars.2024.1349705</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>