<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2024.1389553</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Marine Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Learning hybrid dynamic transformers for underwater image super-resolution</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>He</surname>
<given-names>Xin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2612506"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Junjie</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jia</surname>
<given-names>Tong</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Basic Sciences for Aviation, Naval Aviation University</institution>, <addr-line>Yantai</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Electromechanical and Automotive Engineering, Yantai University</institution>, <addr-line>Yantai</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>School of Art and Design, Yantai Institute of Science and Technology</institution>, <addr-line>Yantai</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Xinyu Zhang, Dalian Maritime University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Feihu Zhang, Northwestern Polytechnical University, China</p>
<p>Xuebo Zhang, Northwest Normal University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Xin He, <email xlink:href="mailto:hexin6770@163.com">hexin6770@163.com</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>25</day>
<month>04</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1389553</elocation-id>
<history>
<date date-type="received">
<day>21</day>
<month>02</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>04</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 He, Li and Jia</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>He, Li and Jia</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Underwater image super-resolution is vital for enhancing the clarity and detail of underwater imagery, enabling improved analysis, navigation, and exploration in underwater environments where visual quality is typically degraded due to factors like water turbidity and light attenuation. In this paper, we propose an effective hybrid dynamic Transformer (called HDT-Net) for underwater image super-resolution, leveraging a collaborative exploration of both local and global information aggregation to help image restoration. Firstly, we introduce a dynamic local self-attention to adaptively capture important spatial details in degraded underwater images by employing dynamic weighting. Secondly, considering that visual transformers tend to introduce irrelevant information when modeling the global context, thereby interfering with the reconstruction of high-resolution images, we design a sparse non-local self-attention to more accurately compute self-similarity by setting a top-k threshold. Finally, we integrate these two self-attention mechanisms into the hybrid dynamic transformer module, constituting the primary feature extraction unit of our proposed method. Quantitative and qualitative analyses on benchmark datasets demonstrate that our approach achieves superior performance compared to previous CNN and Transformer models.</p>
</abstract>
<kwd-group>
<kwd>underwater image</kwd>
<kwd>image super-resolution</kwd>
<kwd>local self-attention</kwd>
<kwd>sparse self-attention</kwd>
<kwd>deep learning</kwd>
<kwd>visual transformer</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="2"/>
<equation-count count="20"/>
<ref-count count="38"/>
<page-count count="11"/>
<word-count count="5531"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Ocean Observation</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Underwater imaging poses distinct challenges owing to the natural attenuation, scattering, and color distortion of light within aquatic environments. These factors contribute to degraded image quality, thereby constraining the effectiveness of underwater observation, exploration, and surveillance systems (refer to <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>). Consequently, underwater image enhancement techniques, notably super-resolution, have attracted considerable attention in recent years. Super-resolution aims to reconstruct high-resolution images from low-resolution counterparts, thereby improving image clarity and detail. It holds immense potential for enhancing the visual quality of underwater scenes, facilitating better analysis, interpretation, and decision-making in various marine applications, such as ocean-observation and offshore engineering (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2024</xref>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The physical imaging process of underwater conditions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g001.tif"/>
</fig>
<p>Despite recent advancements, underwater image super-resolution remains an active area of research, with ongoing efforts to develop robust and efficient algorithms capable of addressing the specific challenges posed by underwater environments (<xref ref-type="bibr" rid="B10">Islam et&#xa0;al., 2020a</xref>). Early efforts in underwater image super-resolution predominantly relied on traditional interpolation algorithms such as bicubic and bilinear. These approaches, while widely used in conventional image processing tasks, often yielded suboptimal results when applied to underwater imagery due to the unique characteristics of underwater environments.</p>
<p>In recent years, significant strides have been made in leveraging deep learning techniques, particularly convolutional neural networks (CNNs), for underwater image super-resolution. Unlike conventional interpolation methods, CNN-based approaches harness the capabilities of deep learning to discern intricate mappings between low-resolution and high-resolution image pairs directly from data. Various architectures, such as SRCNN (Super-Resolution Convolutional Neural Network) (<xref ref-type="bibr" rid="B5">Dong et&#xa0;al., 2015</xref>), VDSR (Very Deep Super-Resolution) (<xref ref-type="bibr" rid="B13">Kim et&#xa0;al., 2016</xref>), and EDSR (Enhanced Deep Super-Resolution) (<xref ref-type="bibr" rid="B22">Lim et&#xa0;al., 2017</xref>), have demonstrated remarkable performance improvements over those of conventional approaches. Subsequent research tends to focus on developing larger and deeper CNN models to enhance learning capabilities. However, due to the extensive number of network parameters, the computational cost of these methods is considerably high, limiting their applicability in real-world underwater scenarios (<xref ref-type="bibr" rid="B12">Jiang et&#xa0;al., 2021</xref>).</p>
<p>Later, Transformer-based architectures (<xref ref-type="bibr" rid="B30">Vaswani et&#xa0;al., 2017</xref>) have emerged as promising alternatives for underwater image super-resolution, offering unique advantages over CNN-based approaches. Unlike CNNs, Transformers leverage self-attention mechanisms to capture global dependencies and long-range dependencies within the input data (<xref ref-type="bibr" rid="B8">Han et&#xa0;al., 2022</xref>). For example, SwinIR (<xref ref-type="bibr" rid="B21">Liang et&#xa0;al., 2021</xref>) employs the window-based attention mechanism to better solve image super-resolution. Although the self-attention mechanism in the sliding window approach enables the extraction of local features, the discontinuity of the windows limits the ability to model local features within each window. In other words, these window-based image super-resolution methods are unable to aggregate information from outside the window, thus limiting the capability to model global information (<xref ref-type="bibr" rid="B17">Li et&#xa0;al., 2023a</xref>).</p>
<p>Indeed, the complexity and variability inherent in underwater environments elevate the challenges associated with underwater image super-resolution beyond those encountered in natural image super-resolution tasks. The Transformer model, renowned for its adeptness in capturing global features, tends to introduce noticeable redundancy during the modeling process. Regrettably, this aspect has often been neglected in prior Transformer-based super-resolution approaches (<xref ref-type="bibr" rid="B34">Xiao et&#xa0;al., 2024</xref>). Therefore, developing a method to explore the characteristics of Transformers, aiming to better integrate both local and global features for joint modeling to achieve high-quality image reconstruction while reducing computational costs, holds significant promise.</p>
<p>To this end, we develop an effective hybrid dynamic Transformer (called HDT-Net) to solve underwater image super-resolution. The proposed method combines dynamic local self-attention with sparse non-local self-attention to synergistically enhance the representation capability of the Transformer model. The former dynamically explores local feature relationships based on a fully CNN model, mitigating errors induced by discontinuous windows. The latter aggregates features by selecting the most useful similarity values, alleviating redundancy caused by small self-attention weights. These strategies are carefully designed to address the challenges of complex underwater environments, thereby leveraging more effective feature information to improve the quality of image super-resolution. Finally, experimental validation on benchmark datasets confirms the effectiveness of the proposed approach.</p>
<p>In summary, the main contributions of this paper are as follows:</p>
<list list-type="bullet">
<list-item>
<p>We propose a lightweight deep model based on a hybrid Transformer for underwater image super-resolution tasks, aiming to enhance the quality of image reconstruction by jointly exploiting local and global feature representations.</p>
</list-item>
<list-item>
<p>We integrate a dynamic local self-attention and a sparse non-local self-attention to enable better capture of local and global feature information respectively, making the Transformer more effective and compact in long-range modeling.</p>
</list-item>
<list-item>
<p>Experimental evaluation on commonly used benchmark datasets for underwater image super-resolution demonstrates that our method outperforms previous CNN and Transformer-based approaches both quantitatively and qualitatively.</p>
</list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>In this section, we present a review of recent work related to underwater image super-resolution and vision transformer.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Underwater image super-resolution</title>
<p>Underwater image super-resolution is an uncertain task, and numerous studies have been conducted to explore suitable methods to address this challenge. Among the deep learning-based underwater image super-resolution models, CNN is one of the most common techniques. Shin et&#xa0;al. (<xref ref-type="bibr" rid="B29">Shin et&#xa0;al., 2016</xref>) proposed a CNN-based framework for estimating environmental light and transmission, featuring a versatile convolutional structure designed to mitigate haze in underwater images. Wang et&#xa0;al. (<xref ref-type="bibr" rid="B33">Wang et&#xa0;al., 2017</xref>) proposed a CNN-based underwater image enhancement framework called UIE-Net, comprising two sub-networks: CC-Net and HR-Net. CC-Net outputs color absorption coefficients for different channels to correct color distortion in&#xa0;underwater images. HR-Net outputs light attenuation transmission maps to enhance the contrast of underwater images. Li et&#xa0;al. (<xref ref-type="bibr" rid="B20">Li et&#xa0;al., 2017</xref>) proposed a novel generator network structure that combines the underwater image formation process to generate high-resolution output images. Subsequently, a dense pixel-level model learning pipeline is employed to perform color correction on monocular underwater images trained based on RGB-D and their corresponding generated images. The methods described above address some aspects of underwater image super-resolution, yet they still exhibit a lack of robustness when handling highly complex underwater scenes.</p>
<p>Li et&#xa0;al. (<xref ref-type="bibr" rid="B18">Li et&#xa0;al., 2019</xref>). constructed an underwater image enhancement benchmark dataset, which provides a large-scale collection of real underwater images along with their corresponding reference images. This benchmark dataset facilitates comprehensive research on existing underwater image enhancement methods and enables easy training of CNNs for underwater image enhancement. But it lacks novelty in terms of algorithmic advancements compared to other methods. Guo et&#xa0;al. (<xref ref-type="bibr" rid="B7">Guo et&#xa0;al., 2019</xref>). proposed an underwater image enhancement method based on GAN. Additionally, the introduced MSDB combined with residual learning can improve network performance, while multiple loss functions can generate visually satisfactory enhancement results. Islam et&#xa0;al. (<xref ref-type="bibr" rid="B11">Islam et&#xa0;al., 2020b</xref>). proposed a simple yet effective underwater image enhancement model based on conditional genetic algorithms. This model evaluates image quality by incorporating global color, content, local texture, and style information to establish a perceptual loss function. Additionally, they provided a large-scale dataset consisting of paired and unpaired underwater image collections for supervised training. Chen et&#xa0;al. (<xref ref-type="bibr" rid="B3">Chen et&#xa0;al., 2020</xref>). proposed an improved deep reinforcement convolutional neural network based on deep learning principles. The main innovation involves incorporating wavelet bases into turbulence-based deep learning convolutional kernels, introducing an improved dense block structure. Further investigation is needed to assess the generalization of the methods utilized in the aforementioned studies to different underwater conditions.</p>
<p>Recently, Li et&#xa0;al. (<xref ref-type="bibr" rid="B16">Li et&#xa0;al., 2021</xref>) proposed a deep underwater image enhancement model. This model learns feature representations from different color spaces and highlights the most discriminative features through channel attention modules. Additionally, domain knowledge is integrated into the network by utilizing inverse media transmission maps as attention weights. Li et&#xa0;al. (<xref ref-type="bibr" rid="B19">Li et&#xa0;al., 2023b</xref>) proposed a novel method for realistic underwater image enhancement and super-resolution called RUIESR. Its purpose is to obtain paired data consistent with realistic degradation for training and to accurately estimate dual degradation to assist in reconstruction. In deep-sea or heavily polluted waters, the degradation characteristics may differ from those observed in the training data, potentially affecting the performance of the above methods. Dharejo et&#xa0;al. (<xref ref-type="bibr" rid="B4">Dharejo et&#xa0;al., 2024</xref>) investigated the integration of a typical Swin transformer with wave attention modules and reversible downsampling to achieve efficient multiscale self-attention learning with lossless downsampling. As a potential improvement over SwinIR, this model allows for faster training and convergence, as well as greater capacity and resolution. The computational complexity and resource requirements of this Transformer-based method may pose challenges.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Vision transformer</title>
<p>Vision Transformer (ViT) (<xref ref-type="bibr" rid="B30">Vaswani et&#xa0;al., 2017</xref>) is a model based on the Transformer architecture, initially proposed by Dosovitskiy et&#xa0;al. (<xref ref-type="bibr" rid="B6">Dosovitskiy et&#xa0;al., 2020</xref>) in 2020 to address image classification tasks in the field of computer vision. The introduction of ViT signifies the expansion of Transformer models from the domain of natural language processing to computer vision, ushering in a new paradigm for image processing tasks. Liang et&#xa0;al. (<xref ref-type="bibr" rid="B21">Liang et&#xa0;al., 2021</xref>) proposed an image restoration model called SwinIR. This model consists of three modules: shallow feature extraction, deep feature extraction, and HR reconstruction. It emphasizes the content-based interaction between image content and attention weights, achieved through a shifting window mechanism for long-range dependency modeling. The IPT (<xref ref-type="bibr" rid="B1">Chen et&#xa0;al., 2021</xref>) employs a multi-head, multi-tail, shared transformer body design, aiming to maximize the potential of the transformer architecture in serving various image processing tasks such as image super-resolution and denoising. The high computational complexity arising from this Transformer design may limit scalability to high-resolution images.</p>
<p>DRSAN (<xref ref-type="bibr" rid="B27">Park et&#xa0;al., 2021</xref>) proposes a dynamic residual network solution for lightweight super-resolution systems, leveraging different combinations of residual features considering input statistics. Additionally, it introduces residual self-attention, which, in collaboration with residual structures, enhances network performance without adding modules. Zamir et&#xa0;al. (<xref ref-type="bibr" rid="B36">Zamir et&#xa0;al., 2022</xref>) introduced Restormer, an image restoration transformer model known for its high computational efficiency in handling high-resolution images. They made critical design adjustments to the core components of the transformer block to enhance feature aggregation and transformation. To integrate the robustness of CNNs into the Transformer model, Restormer incorporates deep convolutions for encoding spatial local context. ELAN (<xref ref-type="bibr" rid="B38">Zhang et&#xa0;al., 2022</xref>) utilizes shift convolution (shift-conv) to effectively extract local structural information from the image. Subsequently, it introduces an intra-group multi-scale self-attention (GMSA) module to leverage the long-range dependency of the image. Further acceleration of the model&#x2019;s computation is achieved by employing a shared attention mechanism. In the task of image super-resolution, the effectiveness of integrating local and global feature representations in the aforementioned methods still requires further improvement.</p>
<p>Diverging from current approaches, we introduce a lightweight deep model rooted in a hybrid dynamic Transformer (HDT-Net). The goal is to bolster the quality of image reconstruction by synergizing local and global feature representations.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed method</title>
<p>In this section, we first describe the overall pipeline of the model. Then, we provide details of the hybrid dynamic transformer module (HDTM), which serves as the fundamental building module of the approach. Each HDTM is composed of four identical hybrid dynamic transformer blocks (HDTBs) connected end to end, as illustrated in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>. The HDTB mainly comprises three key elements: dynamic local self-attention (DLSA), sparse non-local self-attention (SNSA), and feed-forward network (FFN).</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The overall architecture of the proposed network.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g002.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>Overall pipeline</title>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> illustrates an overview of the proposed HDT-Net for underwater image super-resolution. Specifically, the low-resolution underwater image is first processed through a convolutional layer with a filter size of 3&#xd7;3 pixels for shallow feature extraction. Subsequently, the feature information is sequentially processed through six identical HDTMs for deep feature extraction and fusion, both locally and globally. Within each HDTM, four internal HDTBs are connected end to end for processing, and the extracted features are finally passed to the next module through a 3 &#xd7; 3 convolution. After the completion of HDTM processing, the features are further projected using a convolutional layer with a filter size of 3 &#xd7; 3 pixels. Following that, high-resolution image reconstruction is performed through a 3 &#xd7; 3 convolution and upsampling operation using PixelShuffle (<xref ref-type="bibr" rid="B28">Shi et&#xa0;al., 2016</xref>).</p>
<p>The process of the overall pipeline can be represented as <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq4">4</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msup>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>X</mml:mtext>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mn>6</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mstyle>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mtext>high</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>P</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the input features, the 3 &#xd7; 3 convolution, the upsampling operation using PixelShuffle, the low-resolution image features, and the high-resolution image features, respectively. The process of each HDTM in the overall pipeline can be expressed as <xref ref-type="disp-formula" rid="eq5">Equations 5</xref> and <xref ref-type="disp-formula" rid="eq6">6</xref>:</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>M</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Hybrid dynamic transformer block</title>
<p>We propose a hybrid dynamic transformer block consisting of DLSA, SNSA, and FFN. By combining DLSA and SNSA, the hybrid self-attention mechanism effectively weights each position against others in the input data, facilitating the integration of global information into each position&#x2019;s representation. Moreover, it enables the capturing of both global and local feature relationships at different positions in the image, allowing the model to capture long-range dependencies in the data. After each self-attention computation, the representation at each position undergoes non-linear transformation through FFN, mapping it to a new representation space to enhance the model&#x2019;s expressiveness. Formally, given the input features of the (<italic>l</italic> &#x2212; 1)-th block <bold>X</bold><sub><italic>l</italic>&#x2212;1</sub>, the encoding process of the HDTB can be represented as <xref ref-type="disp-formula" rid="eq7">Equations 7</xref>&#x2013;<xref ref-type="disp-formula" rid="eq10">10</xref>:</p>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>D</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>f</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>f</mml:mi>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>f</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>LN</italic> denotes the layer normalization, <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent the outputs of DLSA and SNSA, <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
<mml:mi>f</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mi>l</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the outputs of FFN, which are described below.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Dynamic local self-attention</title>
<p>To enhance the extraction and fusion of local features, we introduce a DLSA method aimed at capturing spatial relationships within an image, while also accommodating variable receptive fields. In contrast to conventional self-attention mechanisms, DLSA functions uniformly across the entire image. This dynamic approach empowers each spatial location to selectively attend to its nearby regions based on contextual cues. Specifically, given input features <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> generated by layer normalization, 1 &#xd7; 1 convolution is performed for feature aggregation. Similar to (<xref ref-type="bibr" rid="B17">Li et&#xa0;al., 2023a</xref>), we introduce a squeeze and excitation network (SENet) (<xref ref-type="bibr" rid="B9">Hu et&#xa0;al., 2018</xref>) as our dynamic weight generation network, which has no normalization layers and non-linear activations. Additionally, we employ a 3&#xd7;3 depth-wise convolutional layer in SENet to encode features, ensuring better calculation of dynamic attention for local attention.</p>
<p>The proposed dynamic weight generation is formulated as <xref ref-type="disp-formula" rid="eq11">Equations 11</xref>&#x2013;<xref ref-type="disp-formula" rid="eq13">13</xref>:</p>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>DCon</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mtext>in</mml:mtext>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:mtext>W</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>x</mml:mtext>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:mi>&#x211b;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the reshaping function. In DLSA, we utilize learnable dynamic convolutions. Unlike traditional fixed kernels, learnable dynamic convolutional kernels offer greater flexibility and adaptability. Each pixel has a corresponding <italic>K</italic> &#xd7; <italic>K</italic> dynamic kernel for dynamic convolution. We divide the number of feature channels into <italic>G</italic> heads, and learn separate dynamic weights in parallel. For the generated pixel-wise weights <bold>W</bold>, we obtain the aggregated features using the following formula as <xref ref-type="disp-formula" rid="eq14">Equation 14</xref>:</p>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>W</mml:mi>
</mml:mstyle>
<mml:mo>&#x229b;</mml:mo>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im8">
<mml:mo>&#x229b;</mml:mo>
</mml:math>
</inline-formula> denotes the dynamic convolution operation using weight sharing across each channel.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Sparse non-local self-attention</title>
<p>Due to the fact that the dynamic estimated features generated by DLSA are based on fully convolutional operations, the efficiency of modeling global features is relatively low. To better perceive global features, we revisit the standard dot-product self-attention in Transformer (<xref ref-type="bibr" rid="B36">Zamir et&#xa0;al., 2022</xref>). However, this algorithm calculates attention maps based on fully connected operations for all query-key pairs. In our work, we develop SNSA to replace it, which leverages sparsity by selecting the top-k tokens (<xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2023</xref>) most relevant to the query, thus obtaining the most crucial information for computation. This approach avoids involving irrelevant information in the feature interaction process.</p>
<p>Specifically, we first perform feature aggregation by applying a 1 &#xd7; 1 convolution, followed by a depthwise convolution with filter size of 3&#xd7;3 pixels to encode per-channel contexts. This allows for self-attention computation across the three dimensions of query Q, key K, and value V, rather than spatial dimensions. Utilizing channel-wise similarity helps reduce memory consumption for efficient inference. Next, we compute the similarity between all pairs of queries and keys, and employ a selection strategy to mask out values with lower similarity, retaining those with higher similarity.</p>
<p>As shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>, k represents an adjustable parameter for dynamically setting the sparsity level. When k=70%, only the top 70% of elements with the highest scores are retained for activation, while the remaining 30% of elements are masked as 0. Next, softmax is applied to normalize the elements within the top-k, ensuring the output is a probability distribution. For elements with scores below the top-k threshold, we use a scatter function to replace their probability at the given index with 0. This dynamic selection results in attention following a sparse distribution. Finally, matrix multiplication is used to multiply the softmax output with the value V, and the result is connected to the input residual through feature projection to obtain the final result.</p>
<p>The derivation formula for SNSA is as <xref ref-type="disp-formula" rid="eq15">Equation 15</xref>:</p>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>Q</mml:mi>
</mml:mstyle>
<mml:mo>,</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>K</mml:mi>
</mml:mstyle>
<mml:mo>,</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>V</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="script">S</mml:mi>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x2133;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>Q</mml:mi>
</mml:mstyle>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>K</mml:mi>
</mml:mstyle>
<mml:mo>&#x22a4;</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>V</mml:mi>
</mml:mstyle>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the softmax operation, <italic>&#x3bb;</italic> is an optional temperature factor defined by <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>=</mml:mo>
<mml:msqrt>
<mml:mi>d</mml:mi>
</mml:msqrt>
</mml:mrow>
</mml:math>
</inline-formula>. Typically, multi-head attention is applied to each of the k new Q, K, and V, resulting in <italic>d</italic> = <italic>C/k</italic> channel dimension outputs, which are then concatenated and projected linearly to obtain the final result of all heads.</p>
<disp-formula id="eq16">
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x2133;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2133;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>&#x2009;top</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>k&#x2009;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>row&#x2009;j</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mtext>&#x2009;otherwise&#x2009;</mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2133;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the top-k selection operator in <xref ref-type="disp-formula" rid="eq16">Equation 16</xref>.</p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Feed-forward network</title>
<p>To extract sophisticated features from both the local and global self-attention data of the model and facilitate the learning of abstract representations, we introduce the FFN following the DLSA and SNSA modules. Specifically, we design two branches based on gating mechanisms. It first uses 1 &#xd7; 1 convolutions for feature transformation and then employs 3 &#xd7; 3 depth-wise convolutions to encode information from spatially adjacent pixel positions. One branch is used to expand feature channels, while in the other branch, it is activated along with the GELU nonlinearity to reduce the channels back to the original input dimension and search for nonlinear contextual information in the hidden layers.</p>
<p>The FFN is formulated as <xref ref-type="disp-formula" rid="eq17">Equations 17</xref>&#x2013;<xref ref-type="disp-formula" rid="eq19">19</xref>:</p>
<disp-formula id="eq17">
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq18">
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq19">
<label>(19)</label>
<mml:math display="block" id="M19">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>X</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mtext>Con</mml:mtext>
<mml:msub>
<mml:mtext>v</mml:mtext>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In general, FFN plays a distinctly different role compared to self-attention. It controls the flow of information passing through various levels of our pipeline, allowing each level to focus on complementary contextual information to other levels.</p>
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Loss function</title>
<p>Building upon existing methods, we adopt the L1 loss function as the loss function for our model. The expression for the L1 loss function is defined as <xref ref-type="disp-formula" rid="eq20">Equation 20</xref>:</p>
<disp-formula id="eq20">
<label>(20)</label>
<mml:math display="block" id="M20">
<mml:mrow>
<mml:mi>&#x2112;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>N</italic> is the number of samples in the dataset. <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the ground truth value for the <italic>i</italic>-th sample. <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the predicted value for the <italic>i</italic>-th sample.</p>
<p>The L1 loss function calculates the mean absolute error between the predicted values and the ground truth values, providing a measure of the average magnitude of the errors.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<p>In this section, we first introduce the implementation details, datasets and evaluation metrics. Then, we compare the proposed HDT-Net with 10 baseline methods, including bicubic, SRCNN (<xref ref-type="bibr" rid="B5">Dong et&#xa0;al., 2015</xref>), DSRCNN (<xref ref-type="bibr" rid="B26">Mao et&#xa0;al., 2016</xref>), SRGAN (<xref ref-type="bibr" rid="B15">Ledig et&#xa0;al., 2017</xref>), SRDM-GAN (<xref ref-type="bibr" rid="B10">Islam et&#xa0;al., 2020a</xref>), RFDN (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2020</xref>), LatticeNet+ (<xref ref-type="bibr" rid="B25">Luo et&#xa0;al., 2020</xref>), SMSR (<xref ref-type="bibr" rid="B32">Wang et&#xa0;al., 2021</xref>), IPT (<xref ref-type="bibr" rid="B1">Chen et&#xa0;al., 2021</xref>), and SwinIR (<xref ref-type="bibr" rid="B21">Liang et&#xa0;al., 2021</xref>). Finally, ablation experiments are conducted to validate the effectiveness of the proposed method. The experiments are conducted on a server with two NVIDIA GeForce RTX 3090 GPUs.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental settings</title>
<sec id="s4_1_1">
<label>4.1.1</label>
<title>Implementation details</title>
<p>In the proposed SNSA, the threshold for top-k is set to 70%. We will analyze its impact in the ablation study. During training, the batch size and patch size are configured as 16 and 64, respectively. The number of attention heads is set to 6, and the number of feature channels is set to 90. We utilize the Adam optimizer (<xref ref-type="bibr" rid="B14">Kingma and Ba, 2014</xref>) with default parameter configurations to train our model. The initial learning rate is established at 5 &#xd7; 10<sup>&#x2212;4</sup>, employing a multi-step scheduler over 500K iterations.</p>
</sec>
<sec id="s4_1_2">
<label>4.1.2</label>
<title>Datasets and evaluation metrics</title>
<p>We validate the performance of various methods using the classic underwater image super-resolution benchmark datasets, USR-248 and UFO-120 (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2024</xref>). Each dataset showcases distinct underwater degradation characteristics, enabling comprehensive evaluation across diverse underwater imaging scenarios. Consistent with previous studies, we utilize PSNR (Peak Signal-to-Noise Ratio) and SSIM (Structural Similarity Index) scores (<xref ref-type="bibr" rid="B31">Wang et&#xa0;al., 2004</xref>) to quantitatively compare the restoration results of different algorithms. In addition, we report the number of model parameters of the different deep networks.</p>
</sec>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Quantitative evaluation</title>
<p>Following (<xref ref-type="bibr" rid="B4">Dharejo et&#xa0;al., 2024</xref>), <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> presents the quantitative results of various methods on the USR-248 and UFO-120 datasets, including experimental setups with three different super-resolution scaling factors: &#xd7;2, &#xd7;4, and &#xd7;8. As shown, the experimental results demonstrate that our proposed HDT-Net consistently achieves the best quantitative performance. Compared to the state-of-the-art method SwinIR (<xref ref-type="bibr" rid="B21">Liang et&#xa0;al., 2021</xref>), our approach shows an average improvement of 0.5 dB in PSNR, with a reduction in parameters by 58%. This indicates that our proposed hybrid transformer, as opposed to window-based transformers, can better capture feature correlations. Particularly challenging is the task of image super-resolution at a scaling factor of &#xd7;8. In contrast, our proposed solution, leveraging the efficient fusion of local and global information, exhibits robust performance advantages in complex underwater scenes.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Quantitative comparisons of different methods on the USR-248 and UFO-120 datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="bottom" rowspan="2" align="center">Methods</th>
<th valign="bottom" align="center"/>
<th valign="bottom" colspan="2" align="center">USR-248</th>
<th valign="bottom" colspan="2" align="center">UFO-120</th>
<th valign="bottom" colspan="2" align="center">Average</th>
<th valign="bottom" rowspan="2" align="center">Params(M)</th>
</tr>
<tr>
<th valign="bottom" align="left">Scale</th>
<th valign="bottom" align="left">PSNR</th>
<th valign="bottom" align="left">SSIM</th>
<th valign="bottom" align="left">PSNR</th>
<th valign="bottom" align="left">SSIM</th>
<th valign="bottom" align="left">PSNR</th>
<th valign="bottom" align="left">SSIM</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="bottom" align="center">Bicubic</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">26.78</td>
<td valign="bottom" align="center">0.8263</td>
<td valign="bottom" align="center">27.01</td>
<td valign="bottom" align="center">0.8465</td>
<td valign="bottom" align="center">26.89</td>
<td valign="bottom" align="center">0.8364</td>
<td valign="bottom" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="bottom" align="center">SRCNN</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">27.89</td>
<td valign="bottom" align="center">0.8467</td>
<td valign="bottom" align="center">27.12</td>
<td valign="bottom" align="center">0.8654</td>
<td valign="bottom" align="center">27.50</td>
<td valign="bottom" align="center">0.8560</td>
<td valign="bottom" align="center">0.067</td>
</tr>
<tr>
<td valign="bottom" align="center">DSRCNN</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">28.12</td>
<td valign="bottom" align="center">0.8584</td>
<td valign="bottom" align="center">27.88</td>
<td valign="bottom" align="center">0.8731</td>
<td valign="bottom" align="center">28.00</td>
<td valign="bottom" align="center">0.8657</td>
<td valign="bottom" align="center">0.361</td>
</tr>
<tr>
<td valign="bottom" align="center">SRGAN</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">28.41</td>
<td valign="bottom" align="center">0.8612</td>
<td valign="bottom" align="center">28.54</td>
<td valign="bottom" align="center">0.8815</td>
<td valign="bottom" align="center">28.47</td>
<td valign="bottom" align="center">0.8713</td>
<td valign="bottom" align="center">1.54</td>
</tr>
<tr>
<td valign="bottom" align="center">SRDM-GAN</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">28.51</td>
<td valign="bottom" align="center">0.8592</td>
<td valign="bottom" align="center">28.58</td>
<td valign="bottom" align="center">0.8823</td>
<td valign="bottom" align="center">28.54</td>
<td valign="bottom" align="center">0.8707</td>
<td valign="bottom" align="center">0.586</td>
</tr>
<tr>
<td valign="bottom" align="center">RFDN</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">28.72</td>
<td valign="bottom" align="center">0.8633</td>
<td valign="bottom" align="center">28.81</td>
<td valign="bottom" align="center">0.8841</td>
<td valign="bottom" align="center">28.76</td>
<td valign="bottom" align="center">0.8737</td>
<td valign="bottom" align="center">0.528</td>
</tr>
<tr>
<td valign="bottom" align="center">LatticeNet+</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">28.74</td>
<td valign="bottom" align="center">0.8714</td>
<td valign="bottom" align="center">28.85</td>
<td valign="bottom" align="center">0.8854</td>
<td valign="bottom" align="center">28.79</td>
<td valign="bottom" align="center">0.8784</td>
<td valign="bottom" align="center">0.75</td>
</tr>
<tr>
<td valign="bottom" align="center">SMSR</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">28.88</td>
<td valign="bottom" align="center">0.8712</td>
<td valign="bottom" align="center">28.91</td>
<td valign="bottom" align="center">0.8862</td>
<td valign="bottom" align="center">28.89</td>
<td valign="bottom" align="center">0.8787</td>
<td valign="bottom" align="center">0.985</td>
</tr>
<tr>
<td valign="bottom" align="center">IPT</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">29.33</td>
<td valign="bottom" align="center">0.8831</td>
<td valign="bottom" align="center">29.05</td>
<td valign="bottom" align="center">0.8921</td>
<td valign="bottom" align="center">29.19</td>
<td valign="bottom" align="center">0.8876</td>
<td valign="bottom" align="center">11.3</td>
</tr>
<tr>
<td valign="bottom" align="center">SwinIR</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">29.88</td>
<td valign="bottom" align="center">0.9018</td>
<td valign="bottom" align="center">30.01</td>
<td valign="bottom" align="center">0.9021</td>
<td valign="bottom" align="center">29.94</td>
<td valign="bottom" align="center">0.9019</td>
<td valign="bottom" align="center">11.45</td>
</tr>
<tr>
<td valign="bottom" align="center">Ours</td>
<td valign="bottom" align="center">x2</td>
<td valign="bottom" align="center">
<bold>31.23</bold>
</td>
<td valign="bottom" align="center">
<bold>0.9217</bold>
</td>
<td valign="bottom" align="center">
<bold>31.54</bold>
</td>
<td valign="bottom" align="center">
<bold>0.9168</bold>
</td>
<td valign="bottom" align="center">
<bold>31.38</bold>
</td>
<td valign="bottom" align="center">
<bold>0.9192</bold>
</td>
<td valign="bottom" align="center">4.71</td>
</tr>
<tr>
<td valign="bottom" align="center">Bicubic</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">25.07</td>
<td valign="bottom" align="center">0.7823</td>
<td valign="bottom" align="center">25.12</td>
<td valign="bottom" align="center">0.8165</td>
<td valign="bottom" align="center">25.09</td>
<td valign="bottom" align="center">0.7994</td>
<td valign="bottom" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="bottom" align="center">SRCNN</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">25.17</td>
<td valign="bottom" align="center">0.7978</td>
<td valign="bottom" align="center">25.21</td>
<td valign="bottom" align="center">0.8157</td>
<td valign="bottom" align="center">25.19</td>
<td valign="bottom" align="center">0.8067</td>
<td valign="bottom" align="center">0.067</td>
</tr>
<tr>
<td valign="bottom" align="center">DSRCNN</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">25.78</td>
<td valign="bottom" align="center">0.8064</td>
<td valign="bottom" align="center">26.81</td>
<td valign="bottom" align="center">0.8177</td>
<td valign="bottom" align="center">26.29</td>
<td valign="bottom" align="center">0.8120</td>
<td valign="bottom" align="center">0.361</td>
</tr>
<tr>
<td valign="bottom" align="center">SRGAN</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">26.09</td>
<td valign="bottom" align="center">0.8178</td>
<td valign="bottom" align="center">26.14</td>
<td valign="bottom" align="center">0.8188</td>
<td valign="bottom" align="center">26.11</td>
<td valign="bottom" align="center">0.8183</td>
<td valign="bottom" align="center">1.54</td>
</tr>
<tr>
<td valign="bottom" align="center">SRDM-GAN</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">26.19</td>
<td valign="bottom" align="center">0.8211</td>
<td valign="bottom" align="center">26.51</td>
<td valign="bottom" align="center">0.8247</td>
<td valign="bottom" align="center">26.35</td>
<td valign="bottom" align="center">0.8229</td>
<td valign="bottom" align="center">0.586</td>
</tr>
<tr>
<td valign="bottom" align="center">RFDN</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">26.66</td>
<td valign="bottom" align="center">0.8216</td>
<td valign="bottom" align="center">26.81</td>
<td valign="bottom" align="center">0.8350</td>
<td valign="bottom" align="center">26.73</td>
<td valign="bottom" align="center">0.8283</td>
<td valign="bottom" align="center">0.528</td>
</tr>
<tr>
<td valign="bottom" align="center">LatticeNet+</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">26.78</td>
<td valign="bottom" align="center">0.8239</td>
<td valign="bottom" align="center">26.85</td>
<td valign="bottom" align="center">0.8245</td>
<td valign="bottom" align="center">26.81</td>
<td valign="bottom" align="center">0.8242</td>
<td valign="bottom" align="center">0.75</td>
</tr>
<tr>
<td valign="bottom" align="center">SMSR</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">27.07</td>
<td valign="bottom" align="center">0.8296</td>
<td valign="bottom" align="center">27.15</td>
<td valign="bottom" align="center">0.8310</td>
<td valign="bottom" align="center">27.11</td>
<td valign="bottom" align="center">0.8303</td>
<td valign="bottom" align="center">0.985</td>
</tr>
<tr>
<td valign="bottom" align="center">IPT</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">27.11</td>
<td valign="bottom" align="center">0.8626</td>
<td valign="bottom" align="center">27.16</td>
<td valign="bottom" align="center">0.8632</td>
<td valign="bottom" align="center">27.13</td>
<td valign="bottom" align="center">0.8629</td>
<td valign="bottom" align="center">11.3</td>
</tr>
<tr>
<td valign="bottom" align="center">SwinIR</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">27.18</td>
<td valign="bottom" align="center">0.8634</td>
<td valign="bottom" align="center">27.27</td>
<td valign="bottom" align="center">0.8644</td>
<td valign="bottom" align="center">27.22</td>
<td valign="bottom" align="center">0.8639</td>
<td valign="bottom" align="center">11.45</td>
</tr>
<tr>
<td valign="bottom" align="center">Ours</td>
<td valign="bottom" align="center">x4</td>
<td valign="bottom" align="center">
<bold>27.69</bold>
</td>
<td valign="bottom" align="center">
<bold>0.8712</bold>
</td>
<td valign="bottom" align="center">
<bold>27.82</bold>
</td>
<td valign="bottom" align="center">
<bold>0.8745</bold>
</td>
<td valign="bottom" align="center">
<bold>27.75</bold>
</td>
<td valign="bottom" align="center">
<bold>0.8728</bold>
</td>
<td valign="bottom" align="center">4.71</td>
</tr>
<tr>
<td valign="bottom" align="center">Bicubic</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">23.46</td>
<td valign="bottom" align="center">0.7684</td>
<td valign="bottom" align="center">23.84</td>
<td valign="bottom" align="center">0.7781</td>
<td valign="bottom" align="center">23.65</td>
<td valign="bottom" align="center">0.7732</td>
<td valign="bottom" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="bottom" align="center">SRCNN</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">24.07</td>
<td valign="bottom" align="center">0.7877</td>
<td valign="bottom" align="center">24.12</td>
<td valign="bottom" align="center">0.7981</td>
<td valign="bottom" align="center">24.09</td>
<td valign="bottom" align="center">0.7929</td>
<td valign="bottom" align="center">0.067</td>
</tr>
<tr>
<td valign="bottom" align="center">DSRCNN</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">24.12</td>
<td valign="bottom" align="center">0.7987</td>
<td valign="bottom" align="center">24.18</td>
<td valign="bottom" align="center">0.8031</td>
<td valign="bottom" align="center">24.15</td>
<td valign="bottom" align="center">0.8009</td>
<td valign="bottom" align="center">0.361</td>
</tr>
<tr>
<td valign="bottom" align="center">SRGAN</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">24.22</td>
<td valign="bottom" align="center">0.8021</td>
<td valign="bottom" align="center">24.29</td>
<td valign="bottom" align="center">0.8024</td>
<td valign="bottom" align="center">24.25</td>
<td valign="bottom" align="center">0.8022</td>
<td valign="bottom" align="center">1.54</td>
</tr>
<tr>
<td valign="bottom" align="center">SRDM-GAN</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">24.41</td>
<td valign="bottom" align="center">0.8162</td>
<td valign="bottom" align="center">24.47</td>
<td valign="bottom" align="center">0.8178</td>
<td valign="bottom" align="center">24.44</td>
<td valign="bottom" align="center">0.8170</td>
<td valign="bottom" align="center">0.586</td>
</tr>
<tr>
<td valign="bottom" align="center">RFDN</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">24.55</td>
<td valign="bottom" align="center">0.8178</td>
<td valign="bottom" align="center">24.67</td>
<td valign="bottom" align="center">0.8218</td>
<td valign="bottom" align="center">24.61</td>
<td valign="bottom" align="center">0.8198</td>
<td valign="bottom" align="center">0.528</td>
</tr>
<tr>
<td valign="bottom" align="center">LatticeNet+</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">25.08</td>
<td valign="bottom" align="center">0.8321</td>
<td valign="bottom" align="center">25.11</td>
<td valign="bottom" align="center">0.8324</td>
<td valign="bottom" align="center">25.09</td>
<td valign="bottom" align="center">0.8322</td>
<td valign="bottom" align="center">0.75</td>
</tr>
<tr>
<td valign="bottom" align="center">SMSR</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">25.16</td>
<td valign="bottom" align="center">0.8344</td>
<td valign="bottom" align="center">25.23</td>
<td valign="bottom" align="center">0.8354</td>
<td valign="bottom" align="center">25.19</td>
<td valign="bottom" align="center">0.8349</td>
<td valign="bottom" align="center">0.985</td>
</tr>
<tr>
<td valign="bottom" align="center">IPT</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">25.22</td>
<td valign="bottom" align="center">0.8353</td>
<td valign="bottom" align="center">25.34</td>
<td valign="bottom" align="center">0.8411</td>
<td valign="bottom" align="center">25.28</td>
<td valign="bottom" align="center">0.8382</td>
<td valign="bottom" align="center">11.3</td>
</tr>
<tr>
<td valign="bottom" align="center">SwinIR</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">25.82</td>
<td valign="bottom" align="center">0.8555</td>
<td valign="bottom" align="center">26.04</td>
<td valign="bottom" align="center">0.8559</td>
<td valign="bottom" align="center">25.93</td>
<td valign="bottom" align="center">0.8557</td>
<td valign="bottom" align="center">11.45</td>
</tr>
<tr>
<td valign="bottom" align="center">Ours</td>
<td valign="bottom" align="center">x8</td>
<td valign="bottom" align="center">
<bold>26.37</bold>
</td>
<td valign="bottom" align="center">
<bold>0.8662</bold>
</td>
<td valign="bottom" align="center">
<bold>26.48</bold>
</td>
<td valign="bottom" align="center">
<bold>0.8655</bold>
</td>
<td valign="bottom" align="center">
<bold>26.42</bold>
</td>
<td valign="bottom" align="center">
<bold>0.8658</bold>
</td>
<td valign="bottom" align="center">4.71</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Qualitative evaluation</title>
<p>
<xref ref-type="fig" rid="f3">
<bold>Figures&#xa0;3</bold>
</xref>, <xref ref-type="fig" rid="f4">
<bold>4</bold>
</xref> illustrate the visual comparison results of different methods on the USR-248 and UFO-120 datasets, respectively. Note that we do not compare RFDN (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2020</xref>) and LatticeNet+ (<xref ref-type="bibr" rid="B25">Luo et&#xa0;al., 2020</xref>) as their visual results are not available. It is evident that effectively enhancing image resolution quality in complex underwater environments presents a formidable challenge compared to natural images. We find that the restoration results of most Transformer-based approaches tend to smooth out the details and textures of the images, which is attributed to the dense pattern of self-attention mechanisms. Furthermore, window-based self-attention global modeling methods fail to effectively aggregate information outside the window, thus affecting the quality of the restored images, as observed in SwinIR (<xref ref-type="bibr" rid="B21">Liang et&#xa0;al., 2021</xref>). In contrast, our proposed method achieves better image restoration by exploring the aggregation of local and global information. These quantitative and qualitative results indicate the effectiveness of the proposed hybrid dynamic Transformers, providing new insights into the challenging task of underwater image super-resolution.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Image super-resolution comparisons for different methods on the USR-248 dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g003.tif"/>
</fig>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Image super-resolution comparisons for different methods on the UFO-120 dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g004.tif"/>
</fig>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Ablation study</title>
<p>In this section, we conduct a further analysis of the impact of the components proposed in our method and compare it against baseline models. To ensure a fair comparison, all baseline models are trained with the same settings as the proposed method. Here, we conduct ablation experiments with &#xd7;2 super-resolution on the USR-248 dataset. Specifically, the ablation study includes (1) the effectiveness of the DLSA and SNSA, (2) the effect of top-k values in the SNSA, and (3) the effect of the number of HDTMs.</p>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Effectiveness of the DLSA and SNSA</title>
<p>First, we analyze the effectiveness of the two key components proposed in the method, including DLSA and SNSA. To do this, we separately remove one of the components for comparative analysis. <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref> presents the quantitative results of different variant models. It can be seen that our approach combining DLSA and SNSA achieves the best performance. <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref> illustrates the visual comparison results of different ablation models. It can be observed that, compared to using only a single self-attention mechanism for feature modeling, our proposed method can better restore the structure and detail regions of underwater images. The combination of local and non-local self-attention mechanisms enables the model to strike a balance between enhancing local details and preserving the overall scene context, resulting in more accurate and coherent super-resolved images.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Quantitative comparison of ablation results about the effectiveness of DLSA and SNSA.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Models</th>
<th valign="top" align="center">w/o DLSA</th>
<th valign="top" align="center">w/o SNSA</th>
<th valign="top" align="center">Ours</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">PSNR/SSIM</td>
<td valign="top" align="center">30.48/0.9060</td>
<td valign="top" align="center">29.63/0.8958</td>
<td valign="top" align="center">
<bold>31.23/0.9217</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Visual comparison of ablation results about the effectiveness of DLSA and SNSA.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g005.tif"/>
</fig>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Effect of top-k values in the SNSA</title>
<p>Next, we analyze the impact of the top-k value in SNSA. The choice of sparsity value plays a crucial role in determining the performance of the model. A smaller sparsity value may result in a dense attention map, which could lead to increased computational overhead and potential overfitting to noisy or irrelevant features. On the other hand, a larger sparsity value may cause the model to miss important global context or relevant features. Therefore, selecting an optimal sparsity value, such as k=70% in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, strikes a balance between capturing sufficient global information and maintaining computational efficiency, ultimately contributing to improved performance in underwater image super-resolution tasks.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Comparison of ablation results about the effect of top-k values in the SNSA.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g006.tif"/>
</fig>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>Effect of the number of HDTMs</title>
<p>Finally, we analyze the impact of the number of HDTMs in the network backbone. <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref> presents the quantitative results using different numbers of HDTMs. It can be observed that when the number increases from 6 to 8, the gain in PSNR gradually levels off. Therefore, to balance model efficiency and performance, we ultimately choose <italic>N</italic> = 6 as the configuration for the final network.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Comparison of ablation results about the effect of the number of HDTMs.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-11-1389553-g007.tif"/>
</fig>
</sec>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Limitations</title>
<p>While our proposed method demonstrates superior performance on classical underwater image super-resolution datasets (visible data) (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2024</xref>), its applicability is currently somewhat limited. The model&#x2019;s performance is significantly affected in scenarios with low light conditions, such as deep-sea environments or areas with poor visibility, where methods utilizing sonar (<xref ref-type="bibr" rid="B35">Yang, 2023</xref>; <xref ref-type="bibr" rid="B37">Zhang et&#xa0;al., 2024</xref>) for detection are more prevalent. To adapt our method to a wider range of underwater scenarios, we will explore the potential applications of the proposed method in sonar images.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusions</title>
<p>In this paper, we have proposed an effective hybrid dynamic Transformer for underwater image super-resolution. We demonstrate the crucial importance of jointly exploring local features and global information in underwater image reconstruction for achieving high-quality results. At the technical level, we integrate dynamic local self-attention and sparse non-local self-attention into a hybrid dynamic transformer module, and stack these modules to form the backbone of our proposed method. The former effectively captures details in underwater image regions, while the latter aids in the recovery of global image structure and color. Our proposed method achieves satisfactory reconstruction results on benchmark datasets. In future work, we will explore the extension of this hybrid transformer approach to other navigation-related visual tasks.</p>
</sec>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://drive.google.com/drive/folders/1dCe5rlw3UpzBs25UMXek1JL0wBBa697Q">https://drive.google.com/drive/folders/1dCe5rlw3UpzBs25UMXek1JL0wBBa697Q</uri>; <uri xlink:href="https://www.v7labs.com/open-datasets/ufo-120">https://www.v7labs.com/open-datasets/ufo-120</uri>.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>XH: Data curation, Formal analysis, Methodology, Writing &#x2013; original draft. JL: Investigation, Writing &#x2013; review &amp; editing. TJ: Data curation, Software, Visualization, Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was partly supported by the Research Project of the Naval Staff Navigation Assurance Bureau (2023(1)).</p>
</sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Pre-trained image processing transformer</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>Virtual</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>12299</fpage>&#x2013;<lpage>12310</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Learning a sparse transformer network for effective image deraining</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name> (<publisher-loc>Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5896</fpage>&#x2013;<lpage>5905</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00571</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Niu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>A wavelet based deep learning method for underwater image super resolution reconstruction</article-title>,&#x201d; in <conf-name>IEEE Access</conf-name> (<publisher-name>IEEE</publisher-name>), <volume>8</volume>, <fpage>117759</fpage>&#x2013;<lpage>117769</lpage>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dharejo</surname> <given-names>F. A.</given-names>
</name>
<name>
<surname>Ganapathi</surname> <given-names>I. I.</given-names>
</name>
<name>
<surname>Zawish</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Alawode</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Alathbah</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Werghi</surname> <given-names>N.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Swinwave-sr: Multi-scale lightweight underwater image super-resolution</article-title>. <source>Inf. Fusion</source> <volume>103</volume>, <fpage>102127</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.inffus.2023.102127</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Loy</surname> <given-names>C. C.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Image super-resolution using deep convolutional networks</article-title>,&#x201d; in <conf-name>IEEE transactions on pattern analysis and machine intelligence</conf-name> (<publisher-name>IEEE</publisher-name>), <volume>38</volume>, <fpage>295</fpage>&#x2013;<lpage>307</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. <source>arXiv preprint arXiv:2010.11929</source>.</citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Guo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhuang</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Underwater image enhancement using a multiscale dense generative adversarial network</article-title>,&#x201d; in <conf-name>IEEE Journal of Oceanic Engineering</conf-name> (<publisher-name>IEEE</publisher-name>), <volume>45</volume>, <fpage>862</fpage>&#x2013;<lpage>870</lpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). &#x201c;<article-title>A survey on vision transformer</article-title>,&#x201d; in <conf-name>IEEE transactions on pattern analysis and machine intelligence</conf-name> (<publisher-name>IEEE</publisher-name>), <volume>45</volume>, <fpage>87</fpage>&#x2013;<lpage>110</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Squeeze-and-excitation networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>7132</fpage>&#x2013;<lpage>7141</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Islam</surname> <given-names>M. J.</given-names>
</name>
<name>
<surname>Enan</surname> <given-names>S. S.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Sattar</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>a). &#x201c;<article-title>Underwater image super-resolution using deep residual multipliers</article-title>,&#x201d; in <conf-name>2020 IEEE International Conference on Robotics and Automation (ICRA)</conf-name>. (<publisher-loc>Virtual</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>900</fpage>&#x2013;<lpage>906</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICRA40945.2020</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Islam</surname> <given-names>M. J.</given-names>
</name>
<name>
<surname>Xia</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sattar</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>b). &#x201c;<article-title>Fast underwater image enhancement for improved visual perception</article-title>,&#x201d; in <conf-name>IEEE Robotics and Automation Letters</conf-name>. (<publisher-loc>Virtual</publisher-loc>: <publisher-name>IEEE</publisher-name>), <volume>5</volume>, <fpage>3227</fpage>&#x2013;<lpage>3234</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LRA.2020.2974710</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>C.-W.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Underwater image enhancement with lightweight cascaded network</article-title>,&#x201d; in <conf-name>IEEE transactions on multimedia</conf-name>. (<publisher-name>IEEE</publisher-name>), <volume>24</volume>, <fpage>4301</fpage>&#x2013;<lpage>4313</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kim</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J. K.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>K. M.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Accurate image super-resolution using very deep convolutional networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1646</fpage>&#x2013;<lpage>1654</lpage>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Ba</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Adam: A method for stochastic optimization</article-title>. <source>arXiv preprint arXiv:1412.6980</source>.</citation>
</ref>
<ref id="B15">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ledig</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Theis</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Husz&#xe1;r</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Caballero</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cunningham</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Acosta</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). &#x201c;<article-title>Photo-realistic single image super-resolution using a generative adversarial network</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4681</fpage>&#x2013;<lpage>4690</lpage>.</citation>
</ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Anwar</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cong</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Underwater image enhancement via medium transmission-guided multi-color space embedding</article-title>,&#x201d; in <conf-name>IEEE Transactions on Image Processing</conf-name>. (<publisher-name>IEEE</publisher-name>) <volume>30</volume>, <fpage>4985</fpage>&#x2013;<lpage>5000</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>a). &#x201c;<article-title>Dlgsanet: lightweight dynamic local and global self-attention networks for image super-resolution</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. (<publisher-loc>France</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>12792</fpage>&#x2013;<lpage>12801</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.01175</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Cong</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kwong</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;<article-title>An underwater image enhancement benchmark dataset and beyond</article-title>,&#x201d; in <conf-name>IEEE Transactions on Image Processing</conf-name>. (<publisher-name>IEEE</publisher-name>) <volume>29</volume>, <fpage>4376</fpage>&#x2013;<lpage>4389</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhuang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>b). &#x201c;<article-title>Ruiesr: Realistic underwater image enhancement and super resolution</article-title>,&#x201d; in <conf-name>IEEE Transactions on Circuits and Systems for Video Technology</conf-name>. (<publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TCSVT.2023.3328785</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Skinner</surname> <given-names>K. A.</given-names>
</name>
<name>
<surname>Eustice</surname> <given-names>R. M.</given-names>
</name>
<name>
<surname>Johnson-Roberson</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Watergan: Unsupervised generative network to enable real-time color correction of monocular underwater images</article-title>,&#x201d; in <conf-name>IEEE Robotics and Automation Letters</conf-name>. (<publisher-name>IEEE</publisher-name>) <volume>3</volume>, <fpage>387</fpage>&#x2013;<lpage>394</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Van Gool</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Timofte</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Swinir: Image restoration using swin transformer</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. (<publisher-loc>Virtual</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1833</fpage>&#x2013;<lpage>1844</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lim</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Son</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Nah</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>K. M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Enhanced deep residual networks for single image super-resolution</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition workshops</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>136</fpage>&#x2013;<lpage>144</lpage>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Ning</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Multi-scale dense spatially-adaptive residual distillation network for lightweight underwater image super-resolution</article-title>. <source>Front. Mar. Sci.</source> <volume>10</volume>, <elocation-id>1328436</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fmars.2023.1328436</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Residual feature distillation network for lightweight image super-resolution</article-title>,&#x201d; in <conf-name>Computer Vision&#x2013;ECCV 2020 Workshops</conf-name>, <conf-loc>UK</conf-loc>, <conf-date>2020</conf-date>. <fpage>41</fpage>&#x2013;<lpage>55</lpage>, Proceedings, Part III 16 (<publisher-loc>UK</publisher-loc>: <publisher-name>Springer</publisher-name>).</citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Luo</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Qu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Latticenet: Towards lightweight image super-resolution with lattice block</article-title>,&#x201d; in <conf-name>Computer Vision&#x2013;ECCV 2020: 16th European Conference</conf-name>, <conf-date>2020</conf-date>. <fpage>272</fpage>&#x2013;<lpage>289</lpage>, Proceedings, Part XXII 16 (<publisher-loc>UK</publisher-loc>: <publisher-name>Springer</publisher-name>).</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mao</surname> <given-names>X.-J.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y.-B.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Image restoration using convolutional auto-encoders with symmetric skip connections</article-title>. <source>arXiv preprint arXiv:1606.08921</source>.</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Park</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Soh</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Cho</surname> <given-names>N. I.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Dynamic residual self-attention network for lightweight single image super-resolution</article-title>,&#x201d; in <conf-name>IEEE Transactions on Multimedia</conf-name>. <volume>25</volume>, <fpage>907</fpage>&#x2013;<lpage>918</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shi</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Caballero</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Husz&#xe1;r</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Totz</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Aitken</surname> <given-names>A. P.</given-names>
</name>
<name>
<surname>Bishop</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). &#x201c;<article-title>Real-time single image and video super-resolution using an efficient sub-pixel convolutional neural network</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1874</fpage>&#x2013;<lpage>1883</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shin</surname> <given-names>Y.-S.</given-names>
</name>
<name>
<surname>Cho</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Pandey</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Estimation of ambient light and transmission map with common convolutional architecture</article-title>,&#x201d; in <conf-name>OCEANS 2016 MTS/IEEE Monterey</conf-name>. (<publisher-name>IEEE</publisher-name>) <volume>2016</volume>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Shazeer</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Parmar</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Uszkoreit</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jones</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Gomez</surname> <given-names>A. N.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>30</volume>.</citation>
</ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Bovik</surname> <given-names>A. C.</given-names>
</name>
<name>
<surname>Sheikh</surname> <given-names>H. R.</given-names>
</name>
<name>
<surname>Simoncelli</surname> <given-names>E. P.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Image quality assessment: from error visibility to structural similarity</article-title>,&#x201d; in <conf-name>IEEE Transactions on Image Processing</conf-name>. (<publisher-name>IEEE</publisher-name>) <volume>13</volume>, <fpage>600</fpage>&#x2013;<lpage>612</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ying</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>An</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Exploring sparsity in image super-resolution for efficient inference</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4917</fpage>&#x2013;<lpage>4926</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>A deep cnn method for underwater image enhancement</article-title>,&#x201d; in <conf-name>2017 IEEE international conference on image processing (ICIP)</conf-name>. (<publisher-name>IEEE</publisher-name>) <volume>2017</volume>, <fpage>1382</fpage>&#x2013;<lpage>1386</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>He</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>C.-W.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Ttst: A top-k token selective transformer for remote sensing image super-resolution</article-title>,&#x201d; in <conf-name>IEEE Transactions on Image Processing</conf-name>. (<publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIP.2023.3349004</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>An imaging algorithm for high-resolution imaging sonar system</article-title>. <source>Multimedia Tools Appl.</source>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-023-16757-0</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zamir</surname> <given-names>S. W.</given-names>
</name>
<name>
<surname>Arora</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hayat</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>F. S.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>M.-H.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Restormer: Efficient transformer for high-resolution image restoration</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5728</fpage>&#x2013;<lpage>5739</lpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). &#x201c;<article-title>A novel multireceiver sas rd processor</article-title>,&#x201d; in <conf-name>IEEE Transactions on Geoscience and Remote Sensing</conf-name>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2024.3362886</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Efficient long-range attention network for image super-resolution</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name>. (<publisher-loc>Israel</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>649</fpage>&#x2013;<lpage>667</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>