<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2026.1778155</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Maritime aerial object detection via dual-domain contrastive learning and low-light degradation enhancement</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Liu</surname><given-names>Liwen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Xu</surname><given-names>Jiayi</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Fu</surname><given-names>Gui</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2235967/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhang</surname><given-names>Xiaoqiang</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2068237/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname><given-names>You</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Fan</surname><given-names>Rong</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>College of Flight Technology, Civil Aviation Flight University of China</institution>, <city>Guanghan</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>College of Aviation Electronic and Electrical Engineering, Civil Aviation Flight University of China</institution>, <city>Chengdu</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Gui Fu, <email xlink:href="mailto:abyfugui@163.com">abyfugui@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-16">
<day>16</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1778155</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>16</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Liu, Xu, Fu, Zhang, Wang and Fan.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Liu, Xu, Fu, Zhang, Wang and Fan</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-16">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>The expansion of marine economic activities and the increasing demand for maritime security have positioned drone-based aerial object detection as a crucial technology for applications such as marine environmental monitoring and maritime law enforcement. However, maritime aerial imagery remains highly challenging due to extreme illumination variations and the small size and indistinct appearance of targets. This paper introduces a novel dual-domain contrastive learning framework integrated with low-light degradation enhancement to address these challenges. First, a low-light degradation perception module performs illumination equalization, improving image uniformity under adverse lighting conditions. Then, a dual-domain contrastive learning strategy aligns representations across both image and feature domains, enabling the detection network to learn more discriminative features. Additionally, a Local Feature Embedding and Global Feature Extraction Module (LEGM) is incorporated into the detection network to enhance the representation of small-scale maritime targets. Experiments on the SeaDronesSee and AFO datasets demonstrate the superiority of the proposed approach, achieving an improvement of 4.3% in mAP@0.5 and 1.9% in mAP@0.5:0.95 on SeaDronesSee, 1.9% in mAP@0.5 and 1.1% in mAP@0.5:0.95 on AFO. These results confirm that the proposed method delivers robust and accurate maritime object detection under complex environmental conditions and has strong potential for deployment in real-world maritime surveillance applications.</p>
</abstract>
<kwd-group>
<kwd>contrastive learning</kwd>
<kwd>deep learning</kwd>
<kwd>low-light degradation enhancement</kwd>
<kwd>marine aerial object detection</kwd>
<kwd>unmanned aerial vehicle</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was funded by the Fund Project for Basic Scientific Research Expenses of Central Universities, grant number 25CAFUC03021, Natural Science Foundation of Sichuan Province, grant number 2024NSFSC0507 and Research on Laser Imaging Detection Technology for Integrated Land-Air UAVs, grant number JCKEYS2025411011.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="2"/>
<equation-count count="19"/>
<ref-count count="26"/>
<page-count count="12"/>
<word-count count="6581"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Solutions for Ocean and Coastal Systems</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>As a nation encompassing both continental and maritime territories, China has seen its civilization molded by millennia of interactions and integration among maritime, agrarian, and nomadic cultures. These cultures have mutually complemented one another, thereby forming the bedrock of Chinese civilization. Currently, the proposition and implementation of concepts such as the construction of a maritime power, the joint promotion of the Belt and Road Initiative (<xref ref-type="bibr" rid="B18">Wang and Liang, 2024</xref>), and the forging of a community with a shared future for mankind have presented maritime civilization with a significant historical opportunity for development. However, maritime development entails both opportunities and challenges. Globally, illegal fishing results in annual losses amounting to $23 billion. Oil spills wreak havoc on 54,000 km&#xb2; of marine ecosystems. Pirate attacks in regions like the Gulf of Aden are escalating at a rate of 17% per annum. Traditional manual patrols are plagued by extensive blind spots and delayed response times. Consequently, there is an urgent need for an intelligent maritime object monitoring system to tackle these issues.</p>
<p>As a versatile automated platform, unmanned aerial vehicles have incrementally evolved into indispensable instruments for maritime surveillance, owing to their extensive coverage and cost-effectiveness. In comparison to conventional satellite remote sensing and vessel-based patrols, UAVs are capable of carrying high-precision sensors and imaging apparatuses, such as high-resolution cameras and LiDAR, to undertake long-duration, large-scale missions. The sea-surface imagery acquired by UAVs offers shore-based personnel substantial data support, thereby augmenting their responsiveness to maritime emergencies.</p>
<p>However, aerial maritime imagery frequently encounters degraded quality and formidable detection challenges, attributed to sea-surface reflections, diminutive targets, and multi-scale variations. Although detection methods predicated on deep learning have emerged as the prevailing techniques with the progress of Convolutional Neural Networks (CNNs), the direct application of generic detection methods to UAV aerial maritime targets generally leads to a substantial decline in accuracy, encompassing severe omissions and false detection. Consequently, achieving precise detection in maritime images captured by UAV still represents a significant research direction within the field of object detection.</p>
<p>Current object detection algorithms driven by deep learning continue to evolve to balance accuracy and efficiency while advancing in multi-modality, lightweight design, and robustness. The field broadly classifies object detection frameworks into two-stage and single-stage architectures. Two-stage models, which are a dominant paradigm in object detection, operate by dividing the detection process into region proposal generation followed by object classification and localization. Evolutionary milestones include R-CNN (<xref ref-type="bibr" rid="B22">Yasir et&#xa0;al., 2024</xref>), followed by Fast R-CNN (<xref ref-type="bibr" rid="B3">Girshick, 2015</xref>), culminating in Faster R-CNN (<xref ref-type="bibr" rid="B14">Ren et&#xa0;al., 2016</xref>). Due to their reliance on pre-generated region proposals, these methods demand higher computational resources and exhibit slower detection speeds, making them unsuitable for maritime detection scenarios with limited computing resources and stringent real-time requirements. Single-stage models, such as the YOLO series (<xref ref-type="bibr" rid="B13">Redmon et&#xa0;al., 2016</xref>), SSD (<xref ref-type="bibr" rid="B21">Yang et&#xa0;al., 2024</xref>), and RetinaNet (<xref ref-type="bibr" rid="B6">Lin et&#xa0;al., 2017</xref>), adopt an end-to-end approach that directly predicts object locations and categories in a single forward pass. This streamlined process delivers faster inference and higher accuracy, rendering these algorithms ideal for real-time maritime detection. To address challenges in maritime target detection, researchers have proposed several solutions, which we categorize into the following three directions: 1. Lightweight Network Design Direction: For instance, <xref ref-type="bibr" rid="B24">Yue et&#xa0;al. (2021)</xref> integrated MobileNet v2 with YOLOv4 to develop a lightweight maritime detection network. 
Their approach employed sparse training on Batch Normalization scaling factors and channel pruning to eliminate redundant parameters. 2. Multi-scale Feature Direction: <xref ref-type="bibr" rid="B5">Hu et&#xa0;al. (2022)</xref> introduced a multi-scale anchor-free detection method using a Balanced Attention Network, enhancing detection of multi-scale maritime targets and nearshore vessels, though model generalization requires further improvement. <xref ref-type="bibr" rid="B17">Wang et&#xa0;al. (2023)</xref> proposed a multi-scale feature fusion network for SAR ship detection, which leverages contextual details to improve contour recognition and detection precision. 3. Attention Mechanism Enhancement Direction: Ma et&#xa0;al. (<xref ref-type="bibr" rid="B10">Ma et&#xa0;al., 2024</xref>) designed a bidirectional coordinate attention mechanism to help networks focus on ship features while suppressing background noise. They further incorporated multi-resolution feature fusion to mitigate spatial information loss in small-scale vessels.</p>
<p>Although the aforementioned methods have targeted improvements for maritime object detection, detecting small maritime targets remains more challenging compared to standard-sized objects. Scholars generally identify small targets through relative scale or absolute scale criteria (<xref ref-type="bibr" rid="B1">Cheng et&#xa0;al., 2023</xref>): Relative scale defines small targets as those occupying less than 0.12% of the total image pixels. Absolute scale classifies targets with resolutions below 32&#xd7;32 pixels as small targets (<xref ref-type="bibr" rid="B7">Lin et&#xa0;al., 2014</xref>).</p>
<p>The key challenges that impede the performance of small-target detection encompass the following aspects:</p>
<list list-type="order">
<list-item>
<p>The scarcity of effective information resulting from low resolution;</p></list-item>
<list-item>
<p>Minimal pixel occupancy, which is further exacerbated by image quality degradation;</p></list-item>
<list-item>
<p>Interference stemming from complex environments (<xref ref-type="bibr" rid="B8">Liu et&#xa0;al., 2023</xref>) and the absence of contextual validation cues (<xref ref-type="bibr" rid="B9">Liu et&#xa0;al., 2022</xref>);</p></list-item>
<list-item>
<p>The prevalent problems of mutual occlusion (<xref ref-type="bibr" rid="B26">Zheng et&#xa0;al., 2022</xref>) and dense distribution (<xref ref-type="bibr" rid="B25">Zhang et&#xa0;al., 2021</xref>).</p></list-item>
</list>
<p>To address these challenges, prior studies have integrated contrastive learning with detection networks: <xref ref-type="bibr" rid="B20">Wang et&#xa0;al. (2021)</xref> aligned visual features by leveraging similar query text fragments from different videos within the same training batch. By mining co-occurring visually similar segments and integrating Noise Contrastive Estimation (NCE) loss, they learned discriminative features to mitigate the visual-textual semantic gap. <xref ref-type="bibr" rid="B4">Hsu et&#xa0;al. (2020)</xref> proposed a progressive adaptation method that employs an intermediate domain to bridge domain gaps, decomposing a challenging task into two simpler subtasks with reduced discrepancies. The intermediate domain is generated by transforming source-domain images into target-like images. This approach progressively addresses adaptation subtasks: first adapting from the source to the intermediate domain, then to the target domain. Additionally, a weighted loss is introduced in the second stage to balance varying image quality within the intermediate domain. The method further mitigates domain shifts across diverse scenarios, weather conditions, and large-scale datasets. Concurrently, addressing the small-sample degradation problem, Qian et&#xa0;al. (<xref ref-type="bibr" rid="B12">Qian et&#xa0;al., 2025b</xref>) proposed an IWNC-based RUL prediction framework that addresses multi-parameter nonlinear degradation via an enhanced Wasserstein GAN, a nonlinear Wiener process, and a Copula function. Qian et&#xa0;al. (<xref ref-type="bibr" rid="B11">Qian et&#xa0;al., 2025a</xref>) proposed an innovative adaptive data augmentation method for reliability assessment. By combining the nonlinear Wiener process with the AAM-GAN (Adaptive Augmentation Magnitude Generative Adversarial Network) algorithm, this approach dynamically expands the sample size, addressing the evaluation bias issues caused by insufficient samples in traditional methods.</p>
<p>Building on the aforementioned research, this paper introduces a dual-domain contrastive learning-guided framework tailored for low-light drone-view maritime detection. The framework integrates a dual-domain contrastive learning network as a standalone guidance module within the target detection network, facilitating knowledge transfer through joint optimization. Crucially, contrastive learning solely guides feature space alignment during the training phase, thereby eliminating any additional computational overhead during inference. This mechanism effectively reduces the computational burden while ensuring real-time performance in maritime target detection.</p>
<p>The contributions of this paper are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>A dual-domain contrastive learning framework is developed to jointly leverage frequency-domain global structural cues with spatial-domain local texture details. This framework effectively enhances feature discriminability while suppressing redundant information, thus reducing computational cost and satisfying real-time maritime target detection requirements.</p></list-item>
<list-item>
<p>A low-light enhancement module is proposed to address illumination imbalance in complex maritime environments. By equalizing brightness and improving image uniformity, the module increases feature separability between small targets and background clutter, thereby improving detection robustness under adverse lighting conditions.</p></list-item>
<list-item>
<p>A novel Locally Embedded Global Feature Extraction Module (LEGM) is designed to enhance multi-scale feature representation through the integrated modeling of local fine-grained structures and global contextual semantics. This design substantially alleviates the problems of missed detection and false positives for small maritime targets.</p></list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>The proposed algorithm</title>
<p>The overall process framework is shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>. In marine environments, images from the UAV perspective often contain small targets, and feature map sizes gradually decrease with frequent downsampling during the object detection process, making feature extraction for small targets typically challenging. Therefore, this paper proposes an algorithm composed of an object detection network and a contrastive learning network. The network integrates spatial and frequency domain information, defined as dual-domain contrastive learning, which prompts the model to identify key characteristics of small targets and construct high-quality feature descriptors. However, in marine object detection, sea surface textures and water reflections often reduce detection accuracy. Thus, the proposed algorithm incorporates a low-light degradation enhancement algorithm before dual-domain contrastive learning. This enhancement algorithm improves image quality while simulating object detection in low-light conditions, providing higher-quality and more diverse training data for subsequent algorithms. Simultaneously, the LEGM module is introduced to optimize the network structure of the object detection framework. Through effective fusion of local and global context information, it strengthens the model&#x2019;s capacity to characterize blurred features and small-object features, thereby improving the model&#x2019;s flexibility when handling intricate tasks. Through the above processing, the proposed algorithm covers a complete improvement pipeline from feature representation to data enhancement and then to network structure, systematically addressing the limitations of object detection algorithms in marine environments.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Overall network architecture.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778155-g001.tif">
<alt-text content-type="machine-generated">Diagram depicting an object detection network architecture. The process begins with an input image, which undergoes low-light degradation enhancement, producing positive and negative samples. These samples are passed through a backbone, neck, and prediction layers. A dual-domain comparative learning network involves both frequency and spatial domains using Fourier Transform. Key elements include loss of characteristics, backpropagation, and a summary calculation of losses.</alt-text>
</graphic></fig>
<sec id="s2_1">
<label>2.1</label>
<title>Dual-domain contrastive learning</title>
<p>The dual-domain contrastive learning framework, illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>, employs separate encoders for the spatial and frequency domains. Images from the datasets are first randomly cropped into rectangular patches based on their annotations. Among these, patches with red borders denoting ground-truth bounding boxes are defined as query instances (q). Positive and negative samples for contrastive learning are then selected by comparing against these queries according to specific Intersection-over-Crop (IoC) ratio thresholds.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Dual-domain comparative learning network architecture.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778155-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a process where an input image, featuring boats near a coastline, is enhanced for low-light conditions. The image is divided, with parts processed through frequency and spatial domain encoders. These involve Fourier transform and average pooling, leading to shared multilayer perceptrons (MLP) outputs. The final stage shows a vertical alignment of results labeled as \(X_{\text{query}}\) and \(X_{\text{key}}\).</alt-text>
</graphic></fig>
<p>Positive Sample (<inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msup><mml:mi>k</mml:mi><mml:mo>+</mml:mo></mml:msup></mml:mrow></mml:math></inline-formula>): Randomly generated crop regions are evaluated by calculating the IoC ratio&#x2014;the area of overlap with any ground-truth bounding box divided by the crop area. If this ratio exceeds 2/3, the region is cropped and designated as a positive sample, marked with a green border.</p>
<p>Negative Sample (<inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msup><mml:mi>k</mml:mi><mml:mo>&#x2212;</mml:mo></mml:msup></mml:mrow></mml:math></inline-formula>): Similarly, randomly generated crop regions are evaluated by calculating the IoC ratio against all ground-truth bounding boxes. If this ratio does not exceed 1/5, the region is cropped and designated as a negative sample, marked with a blue border.</p>
<p>All three types of rectangular patches are first enhanced to address low-light degradation and are then processed by structurally identical spatial and frequency-domain encoders, which consist of convolutional layers, average pooling layers, and MLPs. Following the MoCo framework (<xref ref-type="bibr" rid="B15">Tu et&#xa0;al., 2024</xref>), in the feature space, the feature representations of query and positive samples are made more similar, whereas the representations of query and negative samples are separated, the relationships within this feature space are determined by the loss function shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>.</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>f</mml:mi><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mfrac><mml:mrow><mml:mi>exp</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mo>+</mml:mo></mml:msub><mml:mo stretchy="false">/</mml:mo><mml:mi>&#x3c4;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mtext>m</mml:mtext></mml:munderover><mml:mrow><mml:mi>exp</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mo>&#x2212;</mml:mo></mml:msub><mml:mo stretchy="false">/</mml:mo><mml:mi>&#x3c4;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im3"><mml:mi>m</mml:mi></mml:math></inline-formula> stands for the quantity of negative samples, and the temperature parameter &#x3c4; adjusts the penalty intensity corresponding to these samples. A greater <inline-formula>
<mml:math display="inline" id="im4"><mml:mi>m</mml:mi></mml:math></inline-formula> will reduce the penalty&#x2019;s impact.</p>
<p>As a self-supervised learning strategy, contrastive learning optimizes feature space distances by attracting similar samples and repelling dissimilar ones. During training, the generated queries, positives, and negatives are fed into the spatial-domain encoder, which learns discrete feature representations. Given that objects in images may appear at diverse scales and proportions, the encoder captures scale-sensitive features (e.g., pixel distribution and relative distances) to infer actual object sizes and spatial locations. The spatial-domain encoder focuses on learning textural features of targets, enabling the network to acquire discriminative characteristics for effectively distinguishing object categories. Additionally, it extracts contextual information around targets, deepening the understanding of environmental contexts to reduce false positives and missed detection. Through this process, the semantic features acquired by the spatial encoder significantly enhance the detection ability to extract contextual information from image features, thereby improving target localization and classification precision.</p>
<p>These patches are also transformed via <xref ref-type="disp-formula" rid="eq2">Equation 2</xref> for frequency-domain analysis, enabling the extraction of deep semantic features from spectral representations.</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mrow><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle><mml:mo>&#xb7;</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>j</mml:mi><mml:mn>2</mml:mn><mml:mi>&#x3c0;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mfrac><mml:mrow><mml:mi>u</mml:mi><mml:mi>x</mml:mi></mml:mrow><mml:mi>M</mml:mi></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>v</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mi>N</mml:mi></mml:mfrac><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math>
</disp-formula>
<p>An image is represented in the frequency domain by the complex function <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, where <inline-formula>
<mml:math display="inline" id="im6"><mml:mi>u</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im7"><mml:mi>v</mml:mi></mml:math></inline-formula> are the horizontal and vertical spatial frequencies, while <inline-formula>
<mml:math display="inline" id="im8"><mml:mi>M</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im9"><mml:mi>N</mml:mi></mml:math></inline-formula> refer to the image&#x2019;s width and height, respectively.</p>
<p>High-frequency components, which capture fine details and textures, are extracted from the amplitude spectrum <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> via the filter specified in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>. Low-frequency components, representing global structures, are correspondingly defined by <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>.</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x2009;&#x2009;</mml:mtext><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x2009;&#x2009;&#x2009;</mml:mtext><mml:msqrt><mml:mrow><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mi>M</mml:mi><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>v</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mi>N</mml:mi><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msqrt><mml:mo>&#x2264;</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x2009;&#x2009;</mml:mtext><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>s</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:mi>A</mml:mi><mml:mo>(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> represent the amplitude spectrums corresponding to high-frequency components and the original image, and <inline-formula>
<mml:math display="inline" id="im13"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> refers to the cutoff radius for high-frequency signals.</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x2009;&#x2009;</mml:mtext><mml:msqrt><mml:mrow><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mi>M</mml:mi><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>v</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mi>N</mml:mi><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msqrt><mml:mo>&#x2264;</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#x2009;&#x2009;&#x2009;</mml:mtext><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>s</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> stands for the amplitude spectrum corresponding to low-frequency components, while <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represents the cutoff radius for low-frequency signals.</p>
<p>The frequency-domain processing branch leverages low-frequency components to extract global features that preserve fundamental structural information, even in blurred images. In&#xa0;contrast, high-frequency components correspond to areas with abrupt grayscale variations such as edges, fine details, textures, and high-frequency noise. By analyzing these high-frequency characteristics, the network learns discriminative patterns that effectively separate target objects from complex backgrounds. The integration of spatial and frequency domains thereby enriches feature representation, leading to improved robustness in low-light detection scenarios. This multi-domain fusion strategy not only enhances target recognition accuracy but also deepens the environmental context understanding, ultimately boosting detection performance across diverse and challenging conditions.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Low-light degradation enhancement</title>
<p>By applying low-light degradation processing to images, additional views are generated to enhance model robustness against low-quality inputs. As shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, before dual-domain contrastive learning, low-light degradation is applied to positive sample images. Specifically, the image degradation is divided into two groups of interpolation degradation: the first group uses wavelet coefficient attenuation degradation, adaptive interpolation degradation, and bilinear interpolation degradation; the second group randomly selects one interpolation degradation method from the three interpolation-degraded images obtained from the first group for further degradation. Subsequently, the obtained degraded images are subjected to low-light darkening processing to simulate images under low-light conditions, improving the model&#x2019;s adaptability to scene variations.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Low-light degradation enhances network architecture.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778155-g003.tif">
<alt-text content-type="machine-generated">Flowchart illustrating an image enhancement process. It begins with an input image of a coastline with boats, proceeding to extraction of rectangular frames. These frames undergo degradation processes, including wavelet coefficient attenuation, adaptive interpolation, and bilinear interpolation. The degraded output is treated with low-light adjustment, followed by enhancement steps: stochastic affine transformation, HSV enhancement, and CutMix Blend Enhancement.</alt-text>
</graphic></fig>
<p>Wavelet coefficient attenuation degradation is an image degradation method based on wavelet transform. Its basic idea is to simulate the blurring or detail loss that may occur in the process of image acquisition or transmission by attenuating the high-frequency coefficients obtained after wavelet transform of the image. Specifically, an image is a typical two-dimensional signal. The image is subjected to the two-dimensional discrete wavelet transform of <xref ref-type="disp-formula" rid="eq5">Equation 5</xref> to obtain high-frequency component <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, which is often expressed as HH in the image; vertical component <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, expressed as HL in the image; horizontal component <inline-formula>
<mml:math display="inline" id="im18"><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, expressed as LH in the image; and low-frequency component <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, expressed as LL in the image.</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>g</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mover accent="true"><mml:mi>g</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle><mml:msubsup><mml:mi>a</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>g</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mover 
accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle><mml:msubsup><mml:mi>a</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mover accent="true"><mml:mi>g</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle><mml:msubsup><mml:mi>a</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:msub><mml:mover 
accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mover accent="true"><mml:mi>h</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle><mml:msubsup><mml:mi>a</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>In the degradation process, the main focus is on attenuating the high-frequency coefficients (HH, HL, LH), as these coefficients contain the edge and texture details of the image, and the attenuation equation is shown in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo><mml:mi>l</mml:mi></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im20"><mml:mi>x</mml:mi></mml:math></inline-formula> is the coefficient after attenuation; <inline-formula>
<mml:math display="inline" id="im21"><mml:mi>c</mml:mi></mml:math></inline-formula> is the original coefficient; <inline-formula>
<mml:math display="inline" id="im22"><mml:mi>l</mml:mi></mml:math></inline-formula> is the attenuation factor, which is between 0 and 1. The smaller the attenuation factor, the more high-frequency information is lost, and the blurrier the image.</p>
<p>Adaptive interpolation degradation combines edge information to dynamically select nearest neighbor interpolation degradation or bicubic interpolation degradation. Nearest neighbor interpolation degradation refers to selecting the nearest pixel as its pixel value. It is a simple and fast interpolation method. The core logic here is to first compute the scaling ratio using the dimensions of the original and target images; next, determine the original pixel corresponding to each target pixel via this scaling ratio, and map the original pixel&#x2019;s value to the target pixel. This approach benefits from low computational load and high processing speed, yet it suffers from inferior image output, which often results in mosaic-like textures and jagged edges. The coordinate transformation formula for nearest neighbor interpolation is shown in <xref ref-type="disp-formula" rid="eq7">Equations 7</xref>, <xref ref-type="disp-formula" rid="eq8">8</xref> (<xref ref-type="bibr" rid="B2">Ding et&#xa0;al., 2024a</xref>):</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mi>s</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mi>c</mml:mi><mml:mi>x</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>x</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mi>s</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mi>c</mml:mi><mml:mi>y</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:mi>s</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mi>c</mml:mi><mml:mi>x</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:mi>s</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mi>c</mml:mi><mml:mi>y</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represent the coordinate values corresponding to the original image, <inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:mi>d</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>x</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:mi>d</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> stand for the coordinates in the target image, while <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:math></inline-formula> refers to the scaling factor.</p>
<p>Bicubic interpolation degradation performs cubic interpolation using the grayscale values of 16 neighboring points around a sampled pixel, simultaneously considering both the grayscale values of directly adjacent points and their rate of change. Its computational complexity is significantly higher than that of nearest-neighbor and bilinear interpolation, yet it yields optimal image quality. The mathematical principles are illustrated in <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>.</p>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:mi>W</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x2009;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mi>a</mml:mi><mml:mo>+</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:msup><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mn>3</mml:mn></mml:msup><mml:mo>&#x2212;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>a</mml:mi><mml:mo>+</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:msup><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x2009;&#x2009;</mml:mtext><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x2009;</mml:mtext><mml:mi>a</mml:mi><mml:msup><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mn>3</mml:mn></mml:msup><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn><mml:mi>a</mml:mi><mml:msup><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>+</mml:mo><mml:mn>8</mml:mn><mml:mi>a</mml:mi><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn><mml:mi>a</mml:mi><mml:mtext>&#x2009;&#x2009;</mml:mtext><mml:mn>1</mml:mn><mml:mo>&lt;</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mo>&lt;</mml:mo><mml:mn>2</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;</mml:mtext><mml:mn>0</mml:mn><mml:mtext>&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;&#x2009;</mml:mtext><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>w</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>Bilinear interpolation estimates pixel values at new positions by performing weighted averaging based on four adjacent pixels in both <inline-formula>
<mml:math display="inline" id="im28"><mml:mi>x</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im29"><mml:mi>y</mml:mi></mml:math></inline-formula> directions. While computationally more intensive than nearest-neighbor interpolation (<xref ref-type="bibr" rid="B2">Ding et&#xa0;al., 2024a</xref>), it yields superior image quality with reduced pixelation. The method presupposes known function values <inline-formula>
<mml:math display="inline" id="im30"><mml:mi>f</mml:mi></mml:math></inline-formula> at points <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>11</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im33"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>21</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im34"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>22</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, with the interpolated value obtained through sequential interpolation operations.</p>
<p>Perform interpolation in the x-direction as shown in <xref ref-type="disp-formula" rid="eq10">Equations 10</xref>, <xref ref-type="disp-formula" rid="eq11">11</xref>.</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2248;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>11</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>21</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2248;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>22</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Perform interpolation in the y-direction as shown in <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>.</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2248;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>y</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Based on the above content, the conclusion can be drawn as shown in <xref ref-type="disp-formula" rid="eq13">Equation 13</xref>.</p>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M13"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2248;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>y</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mo stretchy="false">[</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo 
stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>11</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>21</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>y</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mo stretchy="false">[</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mfrac><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>22</mml:mn></mml:mrow></mml:msub><mml:mo 
stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math>
</disp-formula>
<p>Process the above-degraded image with darkness treatment to simulate low-light effects by reducing brightness and enhancing the contrast of dark areas. The key operational steps are: first, limit the intensity parameter to the 0.1&#x2013;0.9 range to prevent excessive brightness or full blackness; next, convert the image from RGB to HSV color space (HSV consists of components representing Hue, Saturation, and Value); extract the Value channel and multiply it by the intensity factor to reduce brightness. Meanwhile, to increase the contrast of dark areas, gamma correction is applied to the V channel; the calculation formula is shown in <xref ref-type="disp-formula" rid="eq14">Equation 14</xref>.</p>
<disp-formula id="eq14"><label>(14)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:mi>&#x3b3;</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3b3;</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1.0</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mtext>I</mml:mtext><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im37"><mml:mi>&#x3b3;</mml:mi></mml:math></inline-formula> represents the contrast modulation factor, <inline-formula>
<mml:math display="inline" id="im38"><mml:mrow><mml:msub><mml:mi>&#x3b3;</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> denotes the baseline contrast modulation factor, and <inline-formula>
<mml:math display="inline" id="im39"><mml:mi>I</mml:mi></mml:math></inline-formula> is the luminance intensity parameter.</p>
<p>Lastly, transform the processed HSV image back into the RGB color space.</p>
<p>The data augmentation strategy generates highly realistic synthetic samples to effectively expand the training set and approximate the real data distribution; this enables the model to acquire more robust feature descriptors and significantly enhance generalization capability. This study employs random affine transformations, HSV color enhancement, and CutMix blending augmentation to collaboratively optimize both original and degraded image data.</p>
<p>Random affine transformation refers to linear transformation plus translation transformation. The corresponding enhancement effect can be achieved by defining different transformation matrices M for different transformation methods. HSV enhancement realizes image enhancement by randomly adjusting hue, saturation, and brightness. CutMix-based data (<xref ref-type="bibr" rid="B23">Yuanbo, 2023</xref>) augmentation creates new training samples by cropping a segment from two randomly chosen images and exchanging these cropped parts. Specifically, first, two images are randomly selected, and a rectangular area is randomly cropped from each image; then, the cropped areas of the two images are swapped and merged into a new image; finally, the label of the resulting image is determined by computing a weighted average of the two original images&#x2019; labels, where weights are based on the cropped regions&#x2019; area.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Locally embedded global feature extraction module</title>
<p>For balancing detection precision and operational efficiency, this study selects the YOLOv8n framework. Specifically, its Backbone achieves lightweight design through the C2f module while preserving the SPPF module for multi-scale optimization. The Neck fuses multi-scale semantic features and introduces the Locally Embedded Global Feature Extraction Module (LEGM) to strengthen channel-level feature integration. The Head employs a decoupled architecture to segregate regression and classification tasks, thereby enabling efficient detection implementation.</p>
<p>Given the suboptimal object detection results in marine scene images, which is usually caused by factors such as sea surface reflection and variable scales, we introduce the LEGM module. Its theoretical basis lies in that the features extracted by convolutional networks contain a large amount of local information, and combining convolutional layers with self-attention mechanisms can simultaneously obtain local and global features, realizing effective feature fusion. The general structural framework of this module is illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. This module can help solve the problems of missed detection or false detection that may occur when traditional one-stage detectors handle complex scenes. Through interactive operations on feature maps of different scales, it enables low-level local features to better utilize high-level semantic information, thereby improving detection accuracy.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>LEGM network architecture.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778155-g004.tif">
<alt-text content-type="machine-generated">A neural network architecture diagram illustrating a series of operations. It starts with three stacked blocks, passing through concatenation, convolution, normalization, linear layers, and softmax, leading to more linear and convolution layers, with several connections culminating in a final output block.</alt-text>
</graphic></fig>
<p>The LEGM module is primarily composed of three components:</p>
<list list-type="order">
<list-item>
<p>Input part: Analyze the multi-scale features of the input image and first perform concatenation.</p></list-item>
<list-item>
<p>Feature processing: Conduct preliminary processing on the concatenated features through a convolution layer (Conv) and a normalization layer (Norm); then perform transformation via a multi-layer perceptron (MLP), followed by further processing using a linear layer (Linear); then generate an attention weight matrix using the Softmax function.</p></list-item>
<list-item>
<p>Feature fusion: Conduct element-wise multiplication of the attention weight matrix and input features, then process the result with convolution and linear layers; next, add this output to the original input features, filter the combined result, and ultimately output the refined features using a multi-layer perceptron (MLP).</p></list-item>
</list>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>The loss function</title>
<p>During training, the object detection network utilizes full images as input, while the contrastive learning network employs positive and negative sample images. These two separate loss functions are merged into a single integrated loss function, detailed in <xref ref-type="disp-formula" rid="eq15">Equation 15</xref>.</p>
<disp-formula id="eq15"><label>(15)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mi>o</mml:mi><mml:mi>int</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x3bb;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im40"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> denotes the object detection loss function, <inline-formula>
<mml:math display="inline" id="im41"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> stands for the contrastive learning loss function, and <inline-formula>
<mml:math display="inline" id="im42"><mml:mi>&#x3bb;</mml:mi></mml:math></inline-formula> refers to the weight coefficient assigned to the contrastive learning loss.</p>
<p>Loss function employs a multi-task joint optimization framework, integrating Bounding Box Regression Loss, Confidence Loss, and Classification Loss to optimize localization, confidence prediction, and classification tasks respectively. The overall expression is shown in <xref ref-type="disp-formula" rid="eq16">Equation 16</xref>.</p>
<disp-formula id="eq16"><label>(16)</label>
<mml:math display="block" id="M16"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>u</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>u</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>u</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im43"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the classification loss; <inline-formula>
<mml:math display="inline" id="im44"><mml:mrow><mml:msub><mml:mi>u</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> is the weight coefficient of the classification loss, with a value of 0.5; <inline-formula>
<mml:math display="inline" id="im45"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the bounding box regression loss; <inline-formula>
<mml:math display="inline" id="im46"><mml:mrow><mml:msub><mml:mi>u</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> is the weight coefficient of the bounding box regression loss, with a value of 7.5; <inline-formula>
<mml:math display="inline" id="im47"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the confidence loss; and <inline-formula>
<mml:math display="inline" id="im48"><mml:mrow><mml:msub><mml:mi>u</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> is the confidence loss coefficient, with a value of 1.5.</p>
<p>For Bounding Box Regression Loss, we use the CIoU (Complete Intersection over Union) loss function, a metric that integrates overlap region, center-point distance, and aspect ratio variance to optimize the localization of predicted boxes. Unlike standard IoU, CIoU offers more comprehensive supervision for box adjustments, particularly in handling aspect ratio discrepancies. The Confidence Loss addresses the challenge of significant positive-negative sample imbalance by applying the Focal Loss, which automatically reduces the impact of well-classified samples during training. For multi-label Classification Loss, the framework employs Binary Cross-Entropy (BCE) to distinguish object categories.</p>
<p>The framework employs cross-domain contrastive guidance to enable collaborative training. By jointly optimizing the corresponding loss with the detection objective, this mechanism generates domain-invariant features that are fed into the detection network. The resulting joint backpropagation iteratively updates parameters, improving detection performance and lowering deployment costs.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experiments and results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Datasets</title>
<p>SeaDronesSee (<xref ref-type="bibr" rid="B16">Varga et&#xa0;al., 2022</xref>), a large-scale benchmark for visual detection and tracking developed by the University of T&#xfc;bingen, focuses on human detection in marine environments. It contains over 54,000 frames (400k instances) captured from altitudes of 5&#x2013;260 meters and perspectives of 0&#x2013;90&#xb0;, with detailed metadata. The dataset supports multimodal system development for maritime search and rescue by providing altitude, perspective, and speed information to improve detection accuracy. It also includes multispectral imagery (e.g., near-infrared and red-edge) to enhance detection capability. The data is split into 5,630 training, 859 validation, and 1,796 test images.</p>
<p>The AFO dataset (<xref ref-type="bibr" rid="B19">Wang et&#xa0;al., 2025</xref>), the first open dataset for maritime search and rescue, consists of 3,647 annotated images extracted from 50 video clips, containing 39,991 object instances in total. These images are divided into training (67.4% of objects), testing (19.12%), and validation (13.48%) subsets. The dataset is derived from drone-captured aerial videos comprising roughly 40,000 frames, featuring manually annotated humans and floating objects, many of small size.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Experimental setup and evaluation metrics</title>
<p>Our model was trained on an NVIDIA GeForce RTX 3070 Ti GPU under the Windows OS; the software setup consists of CUDA 11.8, CUDNN 8.0, and PyTorch as the deep learning framework. During training, we adopted the ADAM optimizer, configured with an initial learning rate of 0.015 and a momentum of 0.937; the batch size was set to 8, and the training was run for 200 epochs.</p>
<p>The evaluation framework employs precision, recall, mean Average Precision (mAP), computational complexity, and parameter count as core metrics. Precision quantifies the accuracy of positive predictions in classification tasks, while recall assesses the model&#x2019;s capability to identify all relevant positive instances. The mAP serves as a comprehensive indicator for object detection and information retrieval performance. Furthermore, parameter volume and computational demands are key measures of model complexity and efficiency. The evaluation indicators are shown in <xref ref-type="disp-formula" rid="eq17">Equations 17</xref>&#x2013;<xref ref-type="disp-formula" rid="eq19">19</xref>.</p>
<disp-formula id="eq17"><label>(17)</label>
<mml:math display="block" id="M17"><mml:mrow><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq18"><label>(18)</label>
<mml:math display="block" id="M18"><mml:mrow><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq19"><label>(19)</label>
<mml:math display="block" id="M19"><mml:mrow><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mrow><mml:mi>A</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Ablation experiments</title>
<p>To analyze how various modules affect the proposed algorithm, this study carries out ablation experiments with YOLOv8n as the base model. The <inline-formula>
<mml:math display="inline" id="im49"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula> marker denotes that the relevant module is employed in the setup, while the &#xd7; marker means it is not adopted. The evaluation results are shown in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Ablation experimental results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center"/>
<th valign="middle" align="center">YOLOv8</th>
<th valign="middle" align="center">Dual-domain comparison</th>
<th valign="middle" align="center">Low light degradation</th>
<th valign="middle" align="center">LEGM</th>
<th valign="middle" align="center">mAP@0.5</th>
<th valign="middle" align="center">mAP@0.5:0.95</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">Params (M)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">A</td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im50"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.908</td>
<td valign="middle" align="center">0.617</td>
<td valign="middle" align="center">0.912</td>
<td valign="middle" align="center">0.852</td>
<td valign="middle" align="center">8.1</td>
<td valign="middle" align="center">3.01</td>
</tr>
<tr>
<td valign="middle" align="center">B</td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im51"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im52"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.911</td>
<td valign="middle" align="center">0.62</td>
<td valign="middle" align="center">0.913</td>
<td valign="middle" align="center">0.855</td>
<td valign="middle" align="center">8.1</td>
<td valign="middle" align="center">3.01</td>
</tr>
<tr>
<td valign="middle" align="center">C</td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im53"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im54"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im55"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.92</td>
<td valign="middle" align="center">0.627</td>
<td valign="middle" align="center">0.899</td>
<td valign="middle" align="center">0.866</td>
<td valign="middle" align="center">8.1</td>
<td valign="middle" align="center">3.01</td>
</tr>
<tr>
<td valign="middle" align="center">D</td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im56"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im58"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im60"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im62"><mml:mo>&#x221a;</mml:mo></mml:math></inline-formula></td>
<td valign="middle" align="center">0.927</td>
<td valign="middle" align="center">0.628</td>
<td valign="middle" align="center">0.906</td>
<td valign="middle" align="center">0.872</td>
<td valign="middle" align="center">10.1</td>
<td valign="middle" align="center">4.07</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Algorithm B integrates the baseline model with dual-domain contrastive learning. As the contrastive component only guides the training of the detection network without introducing additional structural modules, the model maintains its original parameter size and computational load. Enhanced feature representations in both spatial and frequency domains lead to improvements across all four key metrics: P, R, mAP@0.5, and mAP@0.5:0.95. Algorithm C extends Algorithm B by adding a low-light degradation enhancement module, which effectively counters accuracy loss from uneven illumination and blurred small targets in maritime environments, thereby boosting detection performance. A side effect of this enhancement is the reduction in overall brightness, which increases false detection rates and consequently lowers Precision (P). Algorithm D further incorporates the LEGM module into the detection network. The strengthened capability of LEGM in capturing fine-grained features of small targets raises mAP@0.5 and mAP@0.5:0.95 to 92.7% and 62.8%, respectively. As summarized in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>, all proposed enhancements contribute to accuracy gains at varying levels, confirming the efficacy of our approach.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Comparative experiments</title>
<p>To validate algorithm feasibility, a comparative analysis was conducted against established detectors including Fast-RCNN, SSD, YOLOv5s, YOLOv6n, YOLOv8s, YOLOv9t, and YOLOv10n. The comparative results on the SeaDronesSee dataset are summarized in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Performance comparison of various algorithms.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">mAP@0.5</th>
<th valign="middle" align="center">mAP@0.5:0.95</th>
<th valign="middle" align="center">FLOPs(G)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Fast-RCNN</td>
<td valign="middle" align="left">0.5779</td>
<td valign="middle" align="left">0.3408</td>
<td valign="middle" align="left">223.6</td>
</tr>
<tr>
<td valign="middle" align="center">SSD</td>
<td valign="middle" align="left">0.5747</td>
<td valign="middle" align="left">0.2707</td>
<td valign="middle" align="left">88.1</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5s</td>
<td valign="middle" align="left">0.654</td>
<td valign="middle" align="left">0.386</td>
<td valign="middle" align="left">23.8</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv6n</td>
<td valign="middle" align="left">0.606</td>
<td valign="middle" align="left">0.359</td>
<td valign="middle" align="left">11.8</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8s</td>
<td valign="middle" align="left">0.6465</td>
<td valign="middle" align="left">0.3891</td>
<td valign="middle" align="left">28.4</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8n(base)</td>
<td valign="middle" align="left">0.628</td>
<td valign="middle" align="left">0.376</td>
<td valign="middle" align="left">8.1</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv9t</td>
<td valign="middle" align="left">0.645</td>
<td valign="middle" align="left">0.379</td>
<td valign="middle" align="left">6.4</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10n</td>
<td valign="middle" align="left">0.641</td>
<td valign="middle" align="left">0.382</td>
<td valign="middle" align="left">8.2</td>
</tr>
<tr>
<td valign="middle" align="center">The Proposed Algorithm</td>
<td valign="middle" align="left">0.671</td>
<td valign="middle" align="left">0.395</td>
<td valign="middle" align="left">10.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>, compared to the two-stage object detection algorithm Fast-RCNN, our algorithm achieves an improvement of 9.31% in mAP@0.5 and 5.42% in mAP@0.5:0.95 on the SeaDronesSee dataset. Compared to the single-stage object detection algorithm SSD, it improves mAP@0.5 by 9.63% and mAP@0.5:0.95 by 12.43%. Furthermore, compared to the YOLO series algorithms, our method achieves improvements in mAP@0.5 of 1.7%, 6.5%, 2.45%, 4.3%, 2.6% and 3% against YOLOv5s, YOLOv6n, YOLOv8s, YOLOv8n, YOLOv9t and YOLOv10n, respectively, and improvements in mAP@0.5:0.95 of 0.9%, 3.6%, 0.59%, 1.9%, 1.6% and 1.3%, respectively. In terms of computational efficiency, our model requires 10.1G FLOPs, reflecting a good balance between accuracy and complexity. Although YOLOv9t requires only 6.4G FLOPs, making it the most lightweight among the compared algorithms, our model achieves higher detection accuracy with moderate computational overhead. In summary, compared to other models, our detection algorithm achieves the highest average precision while maintaining a moderate computational cost, demonstrating certain advantages in both accuracy and real-time performance for maritime small object detection.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Visualization</title>
<p>A subset of images representing various marine environments, specifically varying in sea conditions, illumination, viewpoints, and target scales, was randomly drawn from the AFO dataset for detection evaluation. As illustrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, the three rows correspond to original images, detection outputs of the YOLOv8 model, and results of our proposed method.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Visualize the result. <bold>(a)</bold> Original images. <bold>(b)</bold> Detection outputs of the YOLOv8 model. <bold>(c)</bold> Results of our proposed method.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778155-g005.tif">
<alt-text content-type="machine-generated">Three sets of aerial photos labeled (a), (b), and (c) compare aquatic scenes. Each set includes four images: the first shows watercraft on a greenish water surface; the second features a boat in open blue water; the third includes vegetation and boats; the fourth depicts a shoreline with boats. In sets (b) and (c), colored boxes indicate detected elements.</alt-text>
</graphic></fig>
<p>Through qualitative analysis of the detection results shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, this section reveals the performance differences between the original YOLOv8 algorithm and the proposed algorithm in marine small target detection. The following discussion explores several fundamental limitations exposed by the original YOLOv8 algorithm in the marine small target detection task from multiple dimensions:</p>
<p>Detection Failure in Target-Dense Scenes: In the first column of dense target images, the YOLOv8 algorithm exhibits both missed detection and false detection of small targets, as indicated by the blue boxes in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>. This issue stems from insufficient feature discrimination capability in complex backgrounds. When multiple small targets are densely distributed, the receptive field design of YOLOv8 encounters challenges in effectively distinguishing adjacent targets, thereby resulting in feature confusion during the feature extraction procedure. Additionally, interference factors such as wave textures and light reflections in marine environments are misidentified as targets, resulting in false detection.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Missed detection and false detection in the original YOLOv8 algorithm.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-13-1778155-g006.tif">
<alt-text content-type="machine-generated">Aerial view of people and paddleboards in shallow, clear water. Red boxes with labels and confidence scores identify humans and paddleboards. Two blue circles highlight the false detection and missed detection areas in object detection. Vegetation is visible beneath the water.</alt-text>
</graphic></fig>
<p>Missed Detection of Single Target: It is particularly noteworthy that in the second column image, despite ideal lighting conditions and the presence of only a single boat target in the scene, YOLOv8 still exhibited a significant missed detection. This indicates that the issue arises not only from environmental complexity but also from the algorithm&#x2019;s inherent insufficient sensitivity to the features of small targets. The feature information of small targets is severely lost during multiple down-sampling processes, preventing the model from retaining sufficient discriminative information in the deep feature maps.</p>
<p>Insufficient Model Generalization Capability: The false detection phenomena observed in the third and fourth columns under similar lighting conditions further confirm the limitations of YOLOv8&#x2019;s generalization ability in marine environments. The algorithm exhibits poor adaptability to variations in lighting and background interference, making it difficult to distinguish real targets from visually similar noise.</p>
<p>In summary, through the synergistic effect of dual-domain contrastive learning, low-light degradation enhancement, and the LEGM module, the proposed algorithm effectively addresses the inherent limitations of YOLOv8 in marine small target detection. It significantly reduces both missed and false detection rates while maintaining high confidence, demonstrating clear performance superiority and practical application value.</p>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusions</title>
<p>This study proposes an innovative method for small target detection in unmanned aerial vehicle (UAV) maritime environments. Its core innovation lies in the joint training of a target detection network with dual-domain contrastive learning (incorporating both spatial and frequency domains). A low-light degradation enhancement module is introduced prior to the dual-domain contrastive learning for producing varied training samples and boosting the model&#x2019;s robustness against noise and blurring. Simultaneously, an LEGM module is embedded into the detection network to strengthen the integration of local and global features, which notably elevates the detection precision and stability of small objects in intricate marine scenes. However, limitations remain, such as insufficient adaptability to extreme conditions like heavy rain and dense fog, and computational complexity that may constrain real-time deployment on resource-limited UAVs. Potential future work may investigate multi-modal data fusion (such as infrared and radar signals) to improve cross-domain generalization performance, develop dynamic adaptive mechanisms to cope with sudden environmental changes, and further optimize model lightweighting and edge deployment efficiency.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>LL: Software, Visualization, Formal Analysis, Conceptualization, Methodology, Writing &#x2013; original draft, Investigation. JX: Methodology, Data curation, Validation, Writing &#x2013; original draft. GF: Funding acquisition, Supervision, Conceptualization, Writing &#x2013; review &amp; editing. XZ: Resources, Writing &#x2013; review &amp; editing, Project administration. YW: Visualization, Writing &#x2013; review &amp; editing, Investigation. FR: Validation, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s9" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cheng</surname> <given-names>G.</given-names></name>
<name><surname>Yuan</surname> <given-names>X.</given-names></name>
<name><surname>Yao</surname> <given-names>X.</given-names></name>
<name><surname>Yan</surname> <given-names>K.</given-names></name>
<name><surname>Zeng</surname> <given-names>Q.</given-names></name>
<name><surname>Xie</surname> <given-names>X.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Towards large-scale small object detection: Survey and benchmarks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume>:<page-range>13467&#x2013;13488</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2023.3290594</pub-id>, PMID: <pub-id pub-id-type="pmid">37384469</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ding</surname> <given-names>S.</given-names></name>
<name><surname>Zhi</surname> <given-names>X.</given-names></name>
<name><surname>Lyu</surname> <given-names>Y.</given-names></name>
<name><surname>Ji</surname> <given-names>Y.</given-names></name>
<name><surname>Guo</surname> <given-names>W.</given-names></name>
</person-group> (<year>2024</year>a). 
<article-title>Deep learning for daily 2-m temperature downscaling</article-title>. <source>Earth Space Sci.</source> <volume>11</volume>, <fpage>e2023EA003227</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1029/2023EA003227</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
</person-group> (<year>2015</year>). &#x201c;
<article-title>Fast R-CNN</article-title>,&#x201d; in <source>2015 IEEE International Conference on Computer Vision (ICCV)</source>. <fpage>1440</fpage>&#x2013;<lpage>1448</lpage>. <publisher-loc>Santiago, Chile</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Hsu</surname> <given-names>H. K.</given-names></name>
<name><surname>Yao</surname> <given-names>C. H.</given-names></name>
<name><surname>Tsai</surname> <given-names>Y. H.</given-names></name>
<name><surname>Hung</surname> <given-names>W. C.</given-names></name>
<name><surname>Tseng</surname> <given-names>H. Y.</given-names></name>
<name><surname>Singh</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). &#x201c;
<article-title>Progressive domain adaptation for object detection</article-title>,&#x201d; in <conf-name>2020 IEEE Winter Conference on Applications of Computer Vision (WACV)</conf-name>. <fpage>749</fpage>&#x2013;<lpage>757</lpage>. <publisher-loc>Snowmass, CO, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hu</surname> <given-names>Q.</given-names></name>
<name><surname>Hu</surname> <given-names>S.</given-names></name>
<name><surname>Liu</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>BANet: A balance attention network for anchor-free ship detection in SAR images</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2022.3146027</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T. Y.</given-names></name>
<name><surname>Goyal</surname> <given-names>P.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Doll&#xe1;r</surname> <given-names>P.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Focal loss for dense object detection</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Computer Vision (ICCV)</conf-name>. <fpage>2980</fpage>&#x2013;<lpage>2988</lpage>. <publisher-loc>Venice, Italy</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T. Y.</given-names></name>
<name><surname>Maire</surname> <given-names>M.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
<name><surname>Hays</surname> <given-names>J.</given-names></name>
<name><surname>Perona</surname> <given-names>P.</given-names></name>
<name><surname>Ramanan</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2014</year>). &#x201c;
<article-title>Microsoft COCO: Common objects in context</article-title>,&#x201d; in <conf-name>Computer Vision &#x2013; ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13</conf-name>. <fpage>740</fpage>&#x2013;<lpage>755</lpage> (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>).
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Hu</surname> <given-names>Z.</given-names></name>
<name><surname>Dai</surname> <given-names>Y.</given-names></name>
<name><surname>Ma</surname> <given-names>X.</given-names></name>
<name><surname>Deng</surname> <given-names>P.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>ISA: Ingenious siamese attention for object detection algorithms towards complex scenes</article-title>. <source>ISA Trans.</source> <volume>143</volume>, <fpage>205</fpage>&#x2013;<lpage>220</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isatra.2023.09.001</pub-id>, PMID: <pub-id pub-id-type="pmid">37704556</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>W.</given-names></name>
<name><surname>Ren</surname> <given-names>G.</given-names></name>
<name><surname>Yu</surname> <given-names>R.</given-names></name>
<name><surname>Guo</surname> <given-names>S.</given-names></name>
<name><surname>Zhu</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). &#x201c;
<article-title>Image-adaptive YOLO for object detection in adverse weather conditions</article-title>,&#x201d; in <conf-name>Proceedings of the 36th Annual AAAI Conference on Artificial Intelligence</conf-name>, Vol. <volume>36</volume>. <fpage>1792</fpage>&#x2013;<lpage>1800</lpage>. <publisher-loc>Washington, DC, USA</publisher-loc>: 
<publisher-name>AAAI Press</publisher-name>.
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ma</surname> <given-names>Y.</given-names></name>
<name><surname>Guan</surname> <given-names>D.</given-names></name>
<name><surname>Deng</surname> <given-names>Y.</given-names></name>
<name><surname>Yuan</surname> <given-names>W.</given-names></name>
<name><surname>Wei</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>3SD-Net: SAR small ship detection neural network</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>62</volume>:<page-range>1&#x2013;13</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2024.3454308</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qian</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
<name><surname>Ming</surname> <given-names>H.</given-names></name>
<name><surname>Yang</surname> <given-names>D.</given-names></name>
<name><surname>Ren</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>a). 
<article-title>An adaptive data augmentation-based reliability evaluation and analysis of lithium-ion batteries considering significant inconsistency in degradation</article-title>. <source>J. Energy Storage</source> <volume>134</volume>, <fpage>118158</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.est.2025.118158</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qian</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Zhu</surname> <given-names>Y.</given-names></name>
<name><surname>Yang</surname> <given-names>D.</given-names></name>
<name><surname>Ren</surname> <given-names>Y.</given-names></name>
<name><surname>Xia</surname> <given-names>Q.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>b). 
<article-title>Remaining useful life prediction considering correlated multi-parameter nonlinear degradation and small sample conditions</article-title>. <source>Comput. Ind. Eng.</source> <volume>210</volume>, <fpage>111567</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cie.2025.111567</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Redmon</surname> <given-names>J.</given-names></name>
<name><surname>Divvala</surname> <given-names>S.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>Farhadi</surname> <given-names>A.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>You only look once: unified, real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>779</fpage>&#x2013;<lpage>788</lpage>.
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ren</surname> <given-names>S.</given-names></name>
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2016</year>). 
<article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>. <source>IEEE Transactions on Pattern Analysis and Machine Intelligence</source>. <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. <publisher-loc>USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>, PMID: <pub-id pub-id-type="pmid">27295650</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tu</surname> <given-names>X.</given-names></name>
<name><surname>He</surname> <given-names>Z.</given-names></name>
<name><surname>Fu</surname> <given-names>G.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Zhong</surname> <given-names>M.</given-names></name>
<name><surname>Zhou</surname> <given-names>C.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Learn discriminative features for small object detection through multi-scale image degradation with contrastive learning</article-title>. <source>IEICE Trans. Inf. Syst.</source> <volume>E108D</volume>:<page-range>371&#x2013;383</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1587/transinf.2024EDP7204</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Varga</surname> <given-names>L. A.</given-names></name>
<name><surname>Kiefer</surname> <given-names>B.</given-names></name>
<name><surname>Messmer</surname> <given-names>M.</given-names></name>
<name><surname>Zell</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>Seadronessee: A maritime benchmark for detecting humans in open water</article-title>,&#x201d; in <conf-name>2022 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)</conf-name>. <fpage>2260</fpage>&#x2013;<lpage>2270</lpage>. <publisher-loc>Waikoloa, HI, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>S.</given-names></name>
<name><surname>Cai</surname></name>
<name><surname>Yuan</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Automatic SAR ship detection based on multifeature fusion network in spatial and frequency domains</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>61</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2023.3267495</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>R.</given-names></name>
<name><surname>Liang</surname> <given-names>T.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>The &#x201c;Belt and Road&#x201d; Initiative and China&#x2019;s sporting goods exports: Basic characteristics and policy evaluation</article-title>. <source>Heliyon</source> <volume>10</volume>, <elocation-id>e33189</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e33189</pub-id>, PMID: <pub-id pub-id-type="pmid">39035513</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Zhao</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>Z.</given-names></name>
<name><surname>Yan</surname> <given-names>Y.</given-names></name>
<name><surname>Yan</surname> <given-names>X.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>LCSC-UAVNet: A high-precision and lightweight model for small-object identification and detection in maritime UAV perspective</article-title>. <source>Drones (2504-446X)</source> <volume>9</volume>, <page-range>100</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/drones9020100</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Jiang</surname> <given-names>Y. G.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Visual co-occurrence alignment learning for weakly-supervised video moment retrieval</article-title>,&#x201d; in <conf-name>Proceedings of the 29th ACM International Conference on Multimedia</conf-name>. <fpage>1459</fpage>&#x2013;<lpage>1468</lpage>. <publisher-loc>New York, NY, United States</publisher-loc>: 
<publisher-name>Association for Computing Machinery (ACM)</publisher-name>.
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>Z. Y.</given-names></name>
<name><surname>Cao</surname> <given-names>X.</given-names></name>
<name><surname>Xu</surname> <given-names>R. Z.</given-names></name>
<name><surname>Hong</surname> <given-names>W. C.</given-names></name>
<name><surname>Sun</surname> <given-names>S. L.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Applications of chaotic quantum adaptive satin bower bird optimizer algorithm in berth-tugboat-quay crane allocation optimization</article-title>. <source>Expert Syst. Appl.</source> <volume>237</volume>, <fpage>121471</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.121471</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yasir</surname> <given-names>M.</given-names></name>
<name><surname>Liu</surname> <given-names>S.</given-names></name>
<name><surname>Mingming</surname> <given-names>X.</given-names></name>
<name><surname>Wan</surname> <given-names>J.</given-names></name>
<name><surname>Pirasteh</surname> <given-names>S.</given-names></name>
<name><surname>Dang</surname> <given-names>K. B.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>ShipGeoNet: SAR image-based geometric feature extraction of ships using convolutional neural networks</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>62</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2024.3352150</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Yuanbo</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). <source>Application of Object Detection and Image Generation in Capsule Endoscopy Assisted Diagnosis System</source> (<publisher-loc>Singapore</publisher-loc>: 
<publisher-name>National University of Singapore</publisher-name>).
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yue</surname> <given-names>T.</given-names></name>
<name><surname>Yang</surname> <given-names>Y.</given-names></name>
<name><surname>Niu</surname> <given-names>J. M.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>A light-weight ship detection and recognition method based on YOLOv4</article-title>,&#x201d; in <conf-name>2021 4th International Conference on Advanced Electronic Materials, Computers and Software Engineering (AEMCSE)</conf-name>. <fpage>661</fpage>&#x2013;<lpage>670</lpage> (<publisher-loc>Changsha, China</publisher-loc>: 
<publisher-name>IEEE</publisher-name>).
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Dayoub</surname> <given-names>F.</given-names></name>
<name><surname>Sunderhauf</surname> <given-names>N.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>VarifocalNet: An IoU-aware dense object detector</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <fpage>8514</fpage>&#x2013;<lpage>8523</lpage>. <publisher-loc>Nashville, TN, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>A.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Qi</surname> <given-names>X.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>Progressive end-to-end object detection in crowded scenes</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <fpage>857</fpage>&#x2013;<lpage>866</lpage>. <publisher-loc>Los Alamitos, CA, USA</publisher-loc>: 
<publisher-name>IEEE Computer Society</publisher-name>.
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3203417">Zifei Xu</ext-link>, University of Liverpool, United Kingdom</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3334742">Xuri Xin</ext-link>, Liverpool John Moores University, United Kingdom</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3343558">Chengbo Wang</ext-link>, Xidian University, China</p></fn>
</fn-group>
</back>
</article>