<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1599968</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2025.1599968</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Target-aware unregistered infrared and visible image fusion</article-title>
<alt-title alt-title-type="left-running-head">Hu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2025.1599968">10.3389/fphy.2025.1599968</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Dengshu</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Ke</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Cuijin</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Liu</surname>
<given-names>Zheng</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3015895/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Che</surname>
<given-names>Yukui</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/3015751/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Dong</surname>
<given-names>Shoubing</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kong</surname>
<given-names>Chuirui</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
</contrib-group>
<aff>
<institution>Qujing Power Supply Bureau</institution>, <institution>Yunnan Power Grid Co., Ltd.</institution>, <addr-line>Qujing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1298136/overview">Zhiqin Zhu</ext-link>, Chongqing University of Posts and Telecommunications, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3019142/overview">Fan Li</ext-link>, Kunming University of Science and Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3019205/overview">Haicheng Bai</ext-link>, Yunnan Normal University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Zheng Liu, <email>490956823@qq.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>06</day>
<month>06</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1599968</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>03</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>05</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Hu, Wang, Zhang, Liu, Che, Dong and Kong.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Hu, Wang, Zhang, Liu, Che, Dong and Kong</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Infrared (IR) and visible (VI) image fusion can provide richer texture details for subsequent object detection tasks. Conversely, object detection can offer semantic information about targets, which in turn helps improve the quality of the fused images. As a result, joint learning approaches that integrate infrared-visible image fusion and object detection have attracted increasing attention.</p>
</sec>
<sec>
<title>Methods</title>
<p>However, existing methods typically assume that the input source images are perfectly aligned spatially&#x2014;an assumption that does not hold in real-world applications. To address this issue, we propose a novel method that enables mutual enhancement between infrared-visible image fusion and object detection, specifically designed to handle misaligned source images. The core idea is to use the object detection loss, propagated via backpropagation, to guide the training of the fusion network, while a specially designed loss function mitigates the modality gap between infrared and visible images.</p>
</sec>
<sec>
<title>Results</title>
<p>Comprehensive experiments on three public datasets demonstrate the effectiveness of our approach.</p>
</sec>
<sec>
<title>Discussion</title>
<p>In addition, our approach can be applied to other radiation frequencies for which different modalities require image fusion, such as the radio-frequency, X-ray, and gamma-ray modalities used in medical imaging.</p>
</sec>
</abstract>
<kwd-group>
<kwd>infrared and visible image fusion</kwd>
<kwd>object detection</kwd>
<kwd>feature alignment</kwd>
<kwd>target-aware</kwd>
<kwd>unregistered</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Radiation Detectors and Imaging</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Images captured by a single sensor often fail to provide a comprehensive description of a scene. For example, infrared (IR) sensors can capture thermal radiation emitted by objects and highlight salient targets, but they lack the ability to represent fine texture details and are more susceptible to noise. On the other hand, visible-light (VI) sensors capture visual information with clear texture details but are easily affected by lighting conditions and occlusions. If the information from both infrared and visible images can be integrated into a single, information-rich fused image, the scene representation can be significantly enhanced. As a result, infrared and visible image fusion has been widely applied as a low-level preprocessing task in various high-level vision applications, such as object detection [<xref ref-type="bibr" rid="B1">1</xref>], tracking [<xref ref-type="bibr" rid="B2">2</xref>], person re-identification [<xref ref-type="bibr" rid="B3">3</xref>], and semantic segmentation [<xref ref-type="bibr" rid="B4">4</xref>]. An example in <xref ref-type="fig" rid="F1">Figure 1</xref> visually illustrates the application of fused images in object detection. It can be observed that detection results obtained from individual sensor images are less accurate than those derived from fused images.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Object detection results of the proposed method on the <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD dataset.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g001.tif"/>
</fig>
<p>Due to its practical value, infrared and visible image fusion has garnered substantial attention in the research community. Over the past decades, numerous image fusion techniques have been proposed, including both traditional and deep learning-based methods. Traditional methods typically fall into two categories: multi-scale transform-based methods [<xref ref-type="bibr" rid="B5">5</xref>&#x2013;<xref ref-type="bibr" rid="B7">7</xref>] and sparse representation-based methods [<xref ref-type="bibr" rid="B8">8</xref>&#x2013;<xref ref-type="bibr" rid="B12">12</xref>]. Deep learning-based approaches include methods based on autoencoders (AE) [<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>], convolutional neural networks (CNNs) [<xref ref-type="bibr" rid="B15">15</xref>&#x2013;<xref ref-type="bibr" rid="B18">18</xref>], and generative adversarial networks (GANs) [<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B20">20</xref>].</p>
<p>Although recent deep learning-based fusion algorithms can generate visually pleasing results, several critical challenges remain unsolved. On one hand, most existing fusion algorithms focus on optimizing visual quality and evaluation metrics, but rarely consider whether the fused results benefit downstream task performance. On the other hand, even recent methods that incorporate high-level vision tasks into the fusion process&#x2014;such as TarDAL [<xref ref-type="bibr" rid="B21">21</xref>], which proposes a dual-level optimization model using a task-aware dual adversarial learning network to simultaneously address fusion and object detection; SeAFusion [<xref ref-type="bibr" rid="B22">22</xref>], which constrains the fusion process with semantic loss to retain richer semantic information; and DetFusion [<xref ref-type="bibr" rid="B23">23</xref>], which guides multimodal fusion using target-related features learned by the object detection network&#x2014;still assume that the source images are perfectly aligned spatially. This assumption does not hold in real-world applications.</p>
<p>In this study, we propose a framework named Target-Aware Unregistered Infrared and Visible Image Fusion Network, designed to achieve robust performance in both misaligned image fusion and high-level vision tasks. Specifically, we introduce an object detection network to predict detection results on the fused image and construct a detection loss. This loss is then backpropagated to guide the training of the fusion network, encouraging the fused image to retain more information useful for object detection. Additionally, to effectively align unregistered images, we design a modality consistency loss to reduce the domain gap between infrared and visible images.</p>
<p>In summary, our main contributions are as follows:<list list-type="simple">
<list-item>
<p>(1) We are the first to unify unregistered image fusion and object detection within a single framework, breaking the limitations of object detection in real-world applications.</p>
</list-item>
<list-item>
<p>(2) We propose a modality consistency loss that effectively eliminates the domain discrepancy between infrared and visible images, improving image registration accuracy.</p>
</list-item>
<list-item>
<p>(3) Our method demonstrates excellent performance in image alignment, fusion, and object detection across multiple datasets. Moreover, it can be applied to other radiation frequencies for which different modalities require image fusion, such as the radio-frequency, X-ray, and gamma-ray modalities used in medical imaging.</p>
</list-item>
</list>
</p>
<p>The rest of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> briefly reviews related work on high-level vision task-driven image fusion and unregistered infrared-visible image fusion. <xref ref-type="sec" rid="s3">Section 3</xref> describes the proposed method in detail. <xref ref-type="sec" rid="s4">Section 4</xref> presents and discusses the experimental results. <xref ref-type="sec" rid="s5">Section 5</xref> concludes the paper.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>In this section, we first provide a brief overview of high-level vision task-driven infrared and visible image fusion methods, and then review existing approaches for unregistered infrared and visible image fusion.</p>
<sec id="s2-1">
<title>2.1 High-level vision task-driven infrared and visible image fusion</title>
<p>High-level vision task-driven fusion methods typically incorporate a semantic segmentation [<xref ref-type="bibr" rid="B24">24</xref>&#x2013;<xref ref-type="bibr" rid="B27">27</xref>] or object detection network [<xref ref-type="bibr" rid="B23">23</xref>, <xref ref-type="bibr" rid="B28">28</xref>] after the fusion network, using the loss functions from these downstream tasks to constrain the fusion results and improve the quality of the fused image. However, introducing high-level vision tasks at the fused image level only provides indirect guidance for the feature extraction network to learn features relevant to the downstream tasks.</p>
<p>To provide direct task-level guidance at the feature level and further enhance fusion performance, PSFusion [<xref ref-type="bibr" rid="B29">29</xref>] injects semantic features extracted from a segmentation task directly into the fusion network. SegMiF [<xref ref-type="bibr" rid="B25">25</xref>] feeds the fused result into a semantic segmentation network to extract semantic features, which then interact with the multimodal image features from the encoder to enhance the fusion result. MRFS [<xref ref-type="bibr" rid="B26">26</xref>] performs interaction and fusion of the source image features before feeding them into a semantic segmentation head to enforce semantic supervision, thereby improving the global scene perception of the fusion network. MetaFusion [<xref ref-type="bibr" rid="B28">28</xref>] sends the fused result into an object detection network to extract features, which are then combined with the source image features and passed into a meta-feature generator to guide feature extraction in the fusion branch.</p>
<p>Although these methods improve fusion performance to some extent by leveraging downstream high-level tasks, they all assume that the input images are perfectly aligned in spatial position&#x2014;a condition rarely met in real-world applications. In practice, such methods rely on additional image registration algorithms to achieve accurate alignment before performing fusion. This not only makes the fusion quality highly dependent on the registration accuracy but also significantly increases the complexity of the overall network design.</p>
</sec>
<sec id="s2-2">
<title>2.2 Unregistered infrared and visible image fusion</title>
<p>To address the problem of unregistered infrared and visible image fusion, most existing approaches combine registration and fusion algorithms, i.e., first aligning the input misaligned image pairs and then performing fusion. However, due to the large modality gap between infrared and visible images, ignoring the adverse impact of modality discrepancy on registration can greatly degrade fusion quality. For instance, ReCoNet [<xref ref-type="bibr" rid="B30">30</xref>] adopts this strategy but produces suboptimal fusion results due to this issue. UMF-CMGR [<xref ref-type="bibr" rid="B31">31</xref>] and IMF [<xref ref-type="bibr" rid="B32">32</xref>] consider the effect of modality differences on registration results. They propose to convert visible images into pseudo-infrared images via an image generation network and then perform mono-modal registration between the pseudo-infrared and misaligned infrared images. However, the quality of the generated image has a direct impact on the final performance of these methods. Moreover, these methods treat registration and fusion as two independent tasks, failing to establish a unified framework where both tasks can benefit each other.</p>
<p>To address this, RFNet [<xref ref-type="bibr" rid="B33">33</xref>] and MURF [<xref ref-type="bibr" rid="B34">34</xref>] treat image fusion as a downstream task of registration and improve registration performance by enhancing the sparsity of the gradient in the fused result. However, to tackle the modality discrepancy issue during registration, both methods aim to transform the multimodal registration into a mono-modal one. Specifically, RFNet uses an image generation model to produce a pseudo-image with the same modality as the misaligned one before performing mono-modal registration, while MURF leverages contrastive learning to extract modality-invariant features from the input image pair for registration. Similarly, Super-Fusion [<xref ref-type="bibr" rid="B35">35</xref>] extracts modality-invariant features using shared-parameter encoders and consistency constraints on the fused result for registration.</p>
<p>Nevertheless, the information carried by modality-invariant features in infrared-visible pairs is often far less rich than the complementary information present in the image pair. As a result, it is difficult to achieve satisfactory cross-modal registration using only modality-invariant features. In addition, the above methods all follow a two-stage approach (registration &#x2b; fusion). This two-stage strategy greatly limits deployment in practical applications due to computational constraints. Although RFVIF [<xref ref-type="bibr" rid="B36">36</xref>], IVFWSR [<xref ref-type="bibr" rid="B37">37</xref>] and MulFS-CAP [<xref ref-type="bibr" rid="B38">38</xref>] attempt to achieve registration and fusion within a single-stage framework, the types of deformations they can handle remain limited. Unlike the methods mentioned above, our approach considers multiple challenges simultaneously: the impact of modality discrepancy on cross-modal registration, the deployment limitations of two-stage processing, and the feature requirements of downstream high-level vision tasks for both registration and fusion.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methods</title>
<sec id="s3-1">
<title>3.1 Overview</title>
<p>As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, the proposed method consists of three core components: feature extraction, feature alignment and fusion, and dual-task reconstruction. The feature extraction component is designed to obtain both modality-specific and modality-common features from the source images. The feature alignment and fusion component is used to predict a deformation field, which is then used to spatially align the infrared-specific and common features. These aligned features are then fused with the corresponding visible image&#x2019;s specific and common features. In the dual-task reconstruction stage, the fused features are fed into the object detection head and the image reconstruction head, respectively, to generate both the object detection result map and the fused image.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overall framework of the proposed method. We use IR/VI-CFE and IR/VI-SFE to extract common and specific features from the infrared and visible images, respectively. To obtain the deformation field for spatial correction, the infrared/visible common features are fed into the registration module to predict the deformation field. This deformation field is then applied to the infrared common/specific features to correct spatial deformation. The corrected infrared features are concatenated with the visible features and then fed into the image reconstruction head and the object detection head, respectively, to generate the fused image and the object detection result.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g002.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Feature extraction</title>
<p>The main objective of feature extraction is to extract both the common and specific features of infrared and visible images, in order to facilitate subsequent cross-modal registration and feature fusion. This process consists of four modules: the IR-Specific Feature Extraction (IR-SFE) module, the VI-Specific Feature Extraction (VI-SFE) module, the IR-Common Feature Extraction (IR-CFE) module, and the VI-Common Feature Extraction (VI-CFE) module. Among them, the IR/VI-SFE modules are used to extract modality-specific features from the infrared/visible images, while the IR/VI-CFE modules are used to extract their common features. Assume that each sample in the training dataset contains three images: a pixel-wise strictly aligned infrared image <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, a visible image <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and a deformed infrared image <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. We feed <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> into both the IR-CFE and the IR-SFE to obtain the infrared common feature <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the deformed infrared common feature <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, the infrared specific feature <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the deformed infrared specific feature <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. At the same time, we feed <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into the VI-CFE and VI-SFE to obtain the visible common feature <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the visible specific feature <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>In the cross-modal registration process, it is usually necessary to rely on the common information between cross-modal images to establish pixel-wise correspondences. To reduce the modality gap between infrared and visible images and thus establish more accurate pixel-wise correspondences, we introduce a modality consistency loss <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e1">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf17">
<mml:math id="m18">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the height, width, and number of channels of the feature maps, respectively, and <inline-formula id="inf18">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the &#x2113;<sub>1</sub>-norm. In addition, considering that the goal of image fusion is to integrate as much complementary information as possible from cross-modal source images into a single image, we introduce the modality complementary information loss <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to further enrich the complementary information from the source images in the fused image:<disp-formula id="e2">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
</sec>
<sec id="s3-3">
<title>3.3 Feature alignment and fusion</title>
<p>Feature alignment corrects the deformation in infrared features by predicting a deformation field, thereby achieving spatial alignment between infrared and visible features. This process is mainly implemented by the registration network. Subsequently, the aligned infrared features are fused with the visible features to obtain the fused features. As shown in <xref ref-type="fig" rid="F3">Figure 3</xref>, the registration network is composed of a Channel and Spatial Enhancement Block (CSEB) and a Multi-Scale Registration Block (MSRB). The CSEB is mainly used to enhance the information beneficial to registration at both the channel and spatial levels, thereby improving the accuracy of the predicted deformation field. The CSEB consists of six feature extraction layers and a Global Average Pooling (GAP) layer. Each feature extraction layer is composed of a convolutional layer with a kernel size of <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and a stride of 1, followed by Batch Normalization (BatchNorm) and a LeakyReLU activation function. The MSRB is used to predict the deformation field to correct the deformed infrared features and ensure spatial alignment between the infrared and visible features. The MSRB adopts a U-Net-like architecture.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Structure of the registration network. The registration network mainly consists of the Channel and Spatial Enhancement Block (CSEB) and the Multi-Scale Registration Block (MSRB).</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g003.tif"/>
</fig>
<p>We input the deformed infrared common feature <inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and the visible common feature <inline-formula id="inf22">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into two CSEBs with unshared parameters, obtaining the enhanced features <inline-formula id="inf23">
<mml:math id="m25">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. Taking the enhancement process of <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as an example, <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is fed into three feature extraction layers to generate the spatial enhancement weights <inline-formula id="inf27">
<mml:math id="m29">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. To enhance registration-relevant information at the spatial level, we perform element-wise multiplication between <inline-formula id="inf28">
<mml:math id="m30">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf29">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e3">
<mml:math id="m32">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>Here, <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the feature enhanced at the spatial level, and <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the element-wise multiplication operation. We feed <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> into three feature extraction layers and a global average pooling (GAP) layer to obtain feature <inline-formula id="inf33">
<mml:math id="m36">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for channel-level enhancement. Then, <inline-formula id="inf34">
<mml:math id="m37">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is element-wise multiplied with <inline-formula id="inf35">
<mml:math id="m38">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> to produce the enhanced feature <inline-formula id="inf36">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which has been refined at both the spatial and channel levels:<disp-formula id="e4">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2299;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>Similarly, we obtain the deformed infrared common feature <inline-formula id="inf37">
<mml:math id="m41">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> enhanced at both the spatial and channel levels. We concatenate <inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension and feed the resulting feature into the MSRB to predict the deformation field <inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. To ensure the accuracy of the predicted deformation field, we introduce a registration loss <inline-formula id="inf41">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">reg</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e5">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">reg</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>Here, <inline-formula id="inf42">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the label of <inline-formula id="inf43">
<mml:math id="m48">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>We use <inline-formula id="inf44">
<mml:math id="m49">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to correct <inline-formula id="inf45">
<mml:math id="m50">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf46">
<mml:math id="m51">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, resulting in the corrected infrared common feature <inline-formula id="inf47">
<mml:math id="m52">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and infrared-specific feature <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e6">
<mml:math id="m54">
<mml:mrow>
<mml:mtable class="array">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mo>&#x25e6;</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mo>&#x25e6;</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>Here, <inline-formula id="inf49">
<mml:math id="m55">
<mml:mrow>
<mml:mo>&#x25e6;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the Warp operation, which resamples the deformed feature maps based on <inline-formula id="inf50">
<mml:math id="m56">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to correct the deformations within them. During the fusion process, to minimize information loss, we concatenate <inline-formula id="inf51">
<mml:math id="m57">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf52">
<mml:math id="m58">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf53">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf54">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension to obtain the fused feature <inline-formula id="inf55">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e7">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>Here, <inline-formula id="inf56">
<mml:math id="m63">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the operation of concatenation along the channel dimension.</p>
</sec>
<sec id="s3-4">
<title>3.4 Dual-task reconstruction</title>
<p>In the dual-task reconstruction, the fused feature is fed into both the object detection head and the image reconstruction head to respectively generate the object detection result map and the fused image. The dual-task reconstruction primarily consists of the object detection head and the image reconstruction head. We adopt YOLOv5 [<xref ref-type="bibr" rid="B39">39</xref>] as the object detection head. The image reconstruction head is composed of three feature extraction layers, where the LeakyReLU activation function in the final layer is replaced with a Tanh activation function. The fused feature <inline-formula id="inf57">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is input into both the object detection head and the image reconstruction head to obtain the object detection result map <inline-formula id="inf58">
<mml:math id="m65">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> and the fused image <inline-formula id="inf59">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. To ensure high-quality object detection results, we introduce the object detection loss <inline-formula id="inf60">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to constrain the network:<disp-formula id="e8">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">yolov5</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
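<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>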
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>Here, <inline-formula id="inf61">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">yolov5</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> refers to the loss function used during the training of YOLOv5. In addition, to encourage the fused image to retain as much shared and complementary information from both infrared and visible images as possible, we introduce luminance loss <inline-formula id="inf62">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and gradient loss <inline-formula id="inf63">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and construct the fusion loss <inline-formula id="inf64">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> accordingly:<disp-formula id="e9">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>Here, <inline-formula id="inf65">
<mml:math id="m74">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the balancing hyperparameter. The gradient loss <inline-formula id="inf66">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined as:<disp-formula id="e10">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>Here, <inline-formula id="inf67">
<mml:math id="m77">
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the Sobel operator. The luminance loss <inline-formula id="inf68">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined as:<disp-formula id="e11">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>Finally, we define the total loss <inline-formula id="inf69">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as follows:<disp-formula id="e12">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">reg</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>Here, <inline-formula id="inf70">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2,3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the balancing hyperparameters.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Experiments</title>
<sec id="s4-1">
<title>4.1 Experimental setup</title>
<sec id="s4-1-1">
<title>4.1.1 Datasets and implementation details</title>
<sec id="s4-1-1-1">
<title>4.1.1.1 Datasets</title>
<p>Following standard experimental practices in the image fusion field [<xref ref-type="bibr" rid="B40">40</xref>&#x2013;<xref ref-type="bibr" rid="B43">43</xref>], we trained our model on 152 pairs of infrared and visible images with a resolution of <inline-formula id="inf71">
<mml:math id="m83">
<mml:mrow>
<mml:mn>512</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> from the RoadScene dataset [<xref ref-type="bibr" rid="B44">44</xref>, <xref ref-type="bibr" rid="B45">45</xref>]. For testing, we used 18 pairs of images from RoadScene and 17 pairs from <inline-formula id="inf72">
<mml:math id="m84">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD [<xref ref-type="bibr" rid="B21">21</xref>]. The misaligned infrared images were generated by randomly applying a combination of rigid and non-rigid deformations to the originally well-aligned infrared images. This type of mixed deformation is applied randomly to the original aligned images in each epoch to augment the training data.</p>
</sec>
<sec id="s4-1-1-2">
<title>4.1.1.2 Implementation details</title>
<p>The proposed method was implemented using the PyTorch framework and trained on a single NVIDIA GeForce RTX 3090 GPU. The model was trained for 150 epochs with a batch size of 8 and a learning rate of 1e-3, using the Adam optimizer to update the model parameters. The four hyperparameters in the loss function were set to <inline-formula id="inf73">
<mml:math id="m85">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf74">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf75">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf76">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
</sec>
<sec id="s4-1-2">
<title>4.1.2 Evaluation metrics</title>
<p>We selected four commonly used image quality evaluation metrics to objectively assess the quality of the fusion results, including correlation coefficient <inline-formula id="inf77">
<mml:math id="m89">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B46">46</xref>], gradient-based fusion performance <inline-formula id="inf78">
<mml:math id="m90">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B47">47</xref>], Chen-Varshney metric <inline-formula id="inf79">
<mml:math id="m91">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B48">48</xref>], and structural similarity <inline-formula id="inf80">
<mml:math id="m92">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B49">49</xref>]. Metric <inline-formula id="inf81">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> evaluates the linear correlation between the fused image and the source images, reflecting their similarity. Metric <inline-formula id="inf82">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> assesses the amount of edge information transferred from the source images to the fused image. Metric <inline-formula id="inf83">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> takes into account both edge information and human visual perception. Metric <inline-formula id="inf84">
<mml:math id="m96">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
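</inline-formula> quantifies information loss and distortion in the fused image by comparing it with the source images. Among these metrics, a lower value of <inline-formula>
<mml:math>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> indicates better fusion quality, while higher values of the other metrics indicate better performance. In addition, we adopted metric <inline-formula id="inf85">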
<mml:math id="m97">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="normal">50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn mathvariant="normal">90</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B50">50</xref>] as the evaluation metric for the object detection task, where a higher <inline-formula id="inf86">
<mml:math id="m98">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="normal">50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn mathvariant="normal">90</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> value indicates better detection performance.</p>
</sec>
</sec>
<sec id="s4-2">
<title>4.2 Comparison with state-of-the-art methods</title>
<p>In our experiments, we first compare the proposed method with two categories of fusion approaches for unaligned infrared and visible images based on their fusion results. We then compare the subsequent object detection results obtained using these two categories of methods. The first category involves registering the images to be fused, followed by image fusion and then object detection. We refer to this category as Registration &#x2b; Fusion &#x2b; Object Detection. The second category performs joint training of registration and fusion to directly handle unaligned images, followed by object detection. We refer to this as Joint Registration and Fusion &#x2b; Object Detection.</p>
<sec id="s4-2-1">
<title>4.2.1 Comparison with registration &#x2b; fusion &#x2b; object detection methods</title>
<p>For the Registration &#x2b; Fusion &#x2b; Object Detection methods, we follow the standard processing pipeline used in prior work. We first adopt the high-performing registration method CrossRAFT [<xref ref-type="bibr" rid="B51">51</xref>] to align the images to be fused. Then, we apply four advanced infrared and visible image fusion methods to the aligned results, including DATFuse [<xref ref-type="bibr" rid="B52">52</xref>], TarDAL [<xref ref-type="bibr" rid="B21">21</xref>], YDTR [<xref ref-type="bibr" rid="B53">53</xref>], and EMMA [<xref ref-type="bibr" rid="B54">54</xref>]. <xref ref-type="fig" rid="F4">Figure 4</xref> shows the visual results of different methods. As seen from the fusion results, our proposed method not only demonstrates stronger capability in preserving structures and textures but also effectively avoids distortions and artifacts caused by feature misalignment. In addition, we performed objective evaluations of the results from different methods. As shown in <xref ref-type="table" rid="T1">Table 1</xref>, our method achieves the best performance across all four evaluation metrics.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Qualitative comparison of fusion results between the Registration &#x2b; Fusion &#x2b; Object Detection methods and the proposed method. The first two columns show the unaligned source images as input. The grid in the first column illustrates the deformation present in the image. Columns 3 to 7 present the fusion results obtained by different methods.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g004.tif"/>
</fig>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Quantitative comparison of fusion results between the Registration &#x2b; Fusion &#x2b; Object Detection methods and the proposed method.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf87">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf88">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf89">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf90">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">DATFuse</td>
<td align="center">0.8303</td>
<td align="center">0.3246</td>
<td align="center">1425.2631</td>
<td align="center">1.2189</td>
</tr>
<tr>
<td align="center">TarDAL</td>
<td align="center">0.8317</td>
<td align="center">0.3313</td>
<td align="center">1396.1484</td>
<td align="center">1.2205</td>
</tr>
<tr>
<td align="center">YDTR</td>
<td align="center">0.8246</td>
<td align="center">0.3179</td>
<td align="center">1383.2556</td>
<td align="center">1.2133</td>
</tr>
<tr>
<td align="center">EMMA</td>
<td align="center">0.8255</td>
<td align="center">0.3341</td>
<td align="center">1399.4075</td>
<td align="center">1.2236</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>0.8325</bold>
</td>
<td align="center">
<bold>0.3420</bold>
</td>
<td align="center">
<bold>1375.5238</bold>
</td>
<td align="center">
<bold>1.2271</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4-2-2">
<title>4.2.2 Comparison with joint registration and fusion &#x2b; object detection methods</title>
<p>In recent years, joint registration and fusion methods have attracted significant attention. To demonstrate the superiority of our approach over these methods, we compared its performance with four joint registration and fusion methods: IMF, IVFWSR, MURF, and SuperFusion. <xref ref-type="fig" rid="F5">Figure 5</xref> presents a qualitative comparison of the fusion results produced by different methods. It can be observed that our method exhibits clear advantages in terms of feature alignment, contrast preservation, and detail retention. In addition, we conducted quantitative experiments to objectively compare the performance differences. As shown in <xref ref-type="table" rid="T2">Table 2</xref>, our method achieves the best performance across all four evaluation metrics.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Qualitative comparison of fusion results between the Joint Registration and Fusion &#x2b; Object Detection methods and the proposed method. The first two columns show the unaligned source images as input. The grid in the first column illustrates the deformation in the image. Columns 3 to 7 display the fusion results produced by different methods.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g005.tif"/>
</fig>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Quantitative comparison of fusion results between the Joint Registration and Fusion &#x2b; Object Detection methods and the proposed method.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf91">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf92">
<mml:math id="m104">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf93">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf94">
<mml:math id="m106">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">IMF</td>
<td align="center">0.8221</td>
<td align="center">0.3119</td>
<td align="center">1477.6932</td>
<td align="center">1.2058</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">0.8269</td>
<td align="center">0.3208</td>
<td align="center">1586.8251</td>
<td align="center">1.2115</td>
</tr>
<tr>
<td align="center">MURF</td>
<td align="center">0.8315</td>
<td align="center">0.3254</td>
<td align="center">1456.3259</td>
<td align="center">1.2140</td>
</tr>
<tr>
<td align="center">SuperFusion</td>
<td align="center">0.8320</td>
<td align="center">0.3396</td>
<td align="center">1399.4521</td>
<td align="center">1.2207</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>0.8325</bold>
</td>
<td align="center">
<bold>0.3420</bold>
</td>
<td align="center">
<bold>1375.5238</bold>
</td>
<td align="center">
<bold>1.2271</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4-2-3">
<title>4.2.3 Performance evaluation on infrared and visible image object detection</title>
<p>We evaluated the object detection performance of the two aforementioned categories of methods, as well as the proposed method, on the <inline-formula id="inf95">
<mml:math id="m107">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD dataset. <xref ref-type="fig" rid="F6">Figure 6</xref> shows the visualized results of object detection. In comparison, our proposed method achieves superior performance. <xref ref-type="table" rid="T3">Table 3</xref> presents the quantitative results. The fused outputs generated by our method help the detection network achieve the highest object detection accuracy. This further demonstrates the superior fusion capability of our approach for object detection tasks.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Visualization of object detection results using different fusion methods on the <inline-formula id="inf96">
<mml:math id="m108">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD dataset.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g006.tif"/>
</fig>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Quantitative object detection results of different fusion methods on the <inline-formula id="inf97">
<mml:math id="m109">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf98">
<mml:math id="m110">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="normal">50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn mathvariant="normal">90</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">DATFuse</td>
<td align="center">53.10</td>
</tr>
<tr>
<td align="center">TarDAL</td>
<td align="center">53.20</td>
</tr>
<tr>
<td align="center">YDTR</td>
<td align="center">53.80</td>
</tr>
<tr>
<td align="center">EMMA</td>
<td align="center">54.20</td>
</tr>
<tr>
<td align="center">IMF</td>
<td align="center">52.40</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">52.60</td>
</tr>
<tr>
<td align="center">MURF</td>
<td align="center">52.20</td>
</tr>
<tr>
<td align="center">SuperFusion</td>
<td align="center">53.80</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>54.50</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4-3">
<title>4.3 Ablation study</title>
<p>The core of the proposed method lies in the losses designed to eliminate modality differences, namely, losses <inline-formula id="inf99">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf100">
<mml:math id="m112">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. In this section, we conduct ablation studies on these key components to verify their effectiveness. All experiments are conducted on the <inline-formula id="inf101">
<mml:math id="m113">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD dataset. From the ablation results, it can be observed that removing losses <inline-formula id="inf102">
<mml:math id="m114">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf103">
<mml:math id="m115">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> leads to a decline in the model&#x2019;s ability to correct local deformations, as shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. In addition, when the shared information is excluded during fusion and only complementary information is used for concatenation, the visual quality of the fused image does not deteriorate significantly, but the objective evaluation results in <xref ref-type="table" rid="T4">Table 4</xref> show a noticeable drop in performance.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Ablation study of the core designs.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g007.tif"/>
</fig>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Quantitative results of the ablation study on the core designs.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf104">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf105">
<mml:math id="m117">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf106">
<mml:math id="m118">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf107">
<mml:math id="m119">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">w/o <inline-formula id="inf108">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.8304</td>
<td align="center">0.3469</td>
<td align="center">1339.9062</td>
<td align="center">1.2204</td>
</tr>
<tr>
<td align="center">w/o <inline-formula id="inf109">
<mml:math id="m121">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.8313</td>
<td align="center">0.3439</td>
<td align="center">1336.8814</td>
<td align="center">1.2256</td>
</tr>
<tr>
<td align="center">w/o Concat <inline-formula id="inf110">
<mml:math id="m122">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf111">
<mml:math id="m123">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.8274</td>
<td align="center">0.3451</td>
<td align="center">1369.4537</td>
<td align="center">1.2114</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>0.8325</bold>
</td>
<td align="center">
<bold>0.3420</bold>
</td>
<td align="center">
<bold>1375.5238</bold>
</td>
<td align="center">
<bold>1.2271</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4-4">
<title>4.4 Analysis of hyperparameters</title>
<p>In our proposed method, four main hyperparameters are defined: <inline-formula id="inf112">
<mml:math id="m124">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf113">
<mml:math id="m125">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf114">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which balance the different losses, i.e., <inline-formula id="inf115">
<mml:math id="m127">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">reg</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf116">
<mml:math id="m128">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf117">
<mml:math id="m129">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf118">
<mml:math id="m130">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, which balances luminance loss <inline-formula id="inf119">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and gradient loss <inline-formula id="inf120">
<mml:math id="m132">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. During model training, <inline-formula id="inf121">
<mml:math id="m133">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf122">
<mml:math id="m134">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf123">
<mml:math id="m135">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf124">
<mml:math id="m136">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are set to 10, 5, 5, and 2, respectively.</p>
<p>Next, we analyze the impact of variations in these hyperparameters on model performance. To analyze the impact of <inline-formula id="inf125">
<mml:math id="m137">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf126">
<mml:math id="m138">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf127">
<mml:math id="m139">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> on fusion performance, we perform a search over <inline-formula id="inf128">
<mml:math id="m140">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf129">
<mml:math id="m141">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf130">
<mml:math id="m142">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> values in the ranges of 1 to 20, 1 to 10, and 1 to 10, respectively. The quantitative evaluation results for both fusion and downstream object detection are presented in <xref ref-type="table" rid="T5">Table 5</xref>. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, the model achieves optimal fusion performance when <inline-formula id="inf131">
<mml:math id="m143">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf132">
<mml:math id="m144">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf133">
<mml:math id="m145">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Quantitative analysis results of the hyperparameter study.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">
<inline-formula id="inf134">
<mml:math id="m146">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf135">
<mml:math id="m147">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf136">
<mml:math id="m148">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf137">
<mml:math id="m149">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf138">
<mml:math id="m150">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf139">
<mml:math id="m151">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf140">
<mml:math id="m152">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf141">
<mml:math id="m153">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">2</td>
<td align="center">10</td>
<td align="center">5</td>
<td align="center">1</td>
<td align="center">0.8235</td>
<td align="center">0.3352</td>
<td align="center">1450.3498</td>
<td align="center">1.2222</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">10</td>
<td align="center">5</td>
<td align="center">10</td>
<td align="center">0.8198</td>
<td align="center">0.3389</td>
<td align="center">1683.8772</td>
<td align="center">1.2195</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">10</td>
<td align="center">1</td>
<td align="center">5</td>
<td align="center">0.8123</td>
<td align="center">0.3321</td>
<td align="center">1502.6641</td>
<td align="center">1.2088</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">10</td>
<td align="center">10</td>
<td align="center">5</td>
<td align="center">0.8260</td>
<td align="center">0.3334</td>
<td align="center">1465.2293</td>
<td align="center">1.2247</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">1</td>
<td align="center">5</td>
<td align="center">5</td>
<td align="center">0.8011</td>
<td align="center">0.3195</td>
<td align="center">1450.3288</td>
<td align="center">1.1954</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">20</td>
<td align="center">5</td>
<td align="center">5</td>
<td align="center">0.8059</td>
<td align="center">0.3248</td>
<td align="center">1529.1245</td>
<td align="center">1.1996</td>
</tr>
<tr>
<td align="center">1</td>
<td align="center">10</td>
<td align="center">5</td>
<td align="center">5</td>
<td align="center">0.8144</td>
<td align="center">0.3340</td>
<td align="center">1499.3888</td>
<td align="center">1.2111</td>
</tr>
<tr>
<td align="center">5</td>
<td align="center">10</td>
<td align="center">5</td>
<td align="center">5</td>
<td align="center">0.8080</td>
<td align="center">0.3302</td>
<td align="center">1775.1124</td>
<td align="center">1.2020</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">10</td>
<td align="center">5</td>
<td align="center">5</td>
<td align="center">
<bold>0.8325</bold>
</td>
<td align="center">
<bold>0.3420</bold>
</td>
<td align="center">
<bold>1375.5238</bold>
</td>
<td align="center">
<bold>1.2271</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To verify the effectiveness of the hyperparameter <inline-formula id="inf142">
<mml:math id="m154">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we fix <inline-formula id="inf143">
<mml:math id="m155">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf144">
<mml:math id="m156">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf145">
<mml:math id="m157">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to 10, 5, and 5, respectively, and analyze the model performance as <inline-formula id="inf146">
<mml:math id="m158">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> varies from 1 to 5. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, the model achieves the best fusion performance when <inline-formula id="inf147">
<mml:math id="m159">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is set to 2. Therefore, we set the hyperparameter <inline-formula id="inf148">
<mml:math id="m160">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to 2.</p>
</sec>
<sec id="s4-5">
<title>4.5 Analysis of computational complexity</title>
<p>As shown in <xref ref-type="table" rid="T6">Table 6</xref>, we evaluate the efficiency of our method from three aspects: FLOPs, model size (number of parameters), and runtime. For the FLOPs calculation, the size of the input images is standardized to <inline-formula id="inf149">
<mml:math id="m161">
<mml:mrow>
<mml:mn>512</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels. The inference time is calculated as the average time taken to process 18 scene images from the RoadScene test set. As shown in <xref ref-type="table" rid="T6">Table 6</xref>, our model requires the fewest FLOPs, implying a low computational cost that makes it application-friendly. The average inference time for our model to fuse two source images is 0.40 s, only slightly longer than that of the fastest method, SuperFusion (0.27 s), demonstrating that our model&#x2019;s inference speed is acceptable. In addition, our model has only 0.97 M parameters, so it can be easily deployed in practical applications. These results indicate that our method is efficient and can serve practical vision tasks well while delivering better visual performance.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Computational efficiency comparison with four SOTA Joint Registration and Fusion methods. All values were measured on a GPU.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">FLOPs(G)</th>
<th align="center">Size(M)</th>
<th align="center">Time(s)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">IMF</td>
<td align="center">1724.08</td>
<td align="center">13.30</td>
<td align="center">0.82</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">859.43</td>
<td align="center">14.09</td>
<td align="center">0.33</td>
</tr>
<tr>
<td align="center">MURF</td>
<td align="center">120.72</td>
<td align="center">1.76</td>
<td align="center">1.18</td>
</tr>
<tr>
<td align="center">SuperFusion</td>
<td align="center">65.43</td>
<td align="center">
<bold>0.14</bold>
</td>
<td align="center">
<bold>0.27</bold>
</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>60.12</bold>
</td>
<td align="center">0.97</td>
<td align="center">0.40</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4-6">
<title>4.6 Analysis of generalization ability</title>
<p>To validate the generalization ability of our method, we conduct experiments in other scenarios. Fusion results are shown in <xref ref-type="fig" rid="F8">Figure 8</xref>. The qualitative results show that our proposed model also performs well in these scenarios.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Fusion results of our method on different scenarios.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g008.tif"/>
</fig>
</sec>
<sec id="s4-7">
<title>4.7 Analysis of limitation</title>
<p>The proposed method enables mutual enhancement between infrared-visible image fusion and object detection and is specifically designed to handle misaligned source images, achieving better experimental results than other methods. However, our approach still has certain limitations. Specifically, our model is trained on a synthetically generated unaligned dataset, which cannot fully cover the deformations found in real-world images; as a result, failure cases appear in real-world scenarios. As shown in <xref ref-type="fig" rid="F9">Figure 9</xref>, our method fails to handle some deformations in real-world scenarios. Improving the robustness of our method is vital for future research.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Failure cases of our method on the real-world dataset CVC-14.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g009.tif"/>
</fig>
</sec>
<sec id="s4-8">
<title>4.8 Further discussion</title>
<p>To validate the effectiveness of the proposed method in the field of medical imaging, we conduct a comparative study on the publicly available BraTS2020 Menze et al. [<xref ref-type="bibr" rid="B55">55</xref>] dataset. Specifically, we first employ the state-of-the-art medical image registration method CorrMLP Meng et al. [<xref ref-type="bibr" rid="B56">56</xref>] to align the deformed MRI-T2 images to the reference MRI-T1 images, and subsequently apply several advanced fusion methods (including MATR Tang et al. [<xref ref-type="bibr" rid="B57">57</xref>], ALMFnet Mu et al. [<xref ref-type="bibr" rid="B58">58</xref>], EMMA Zhao et al. [<xref ref-type="bibr" rid="B54">54</xref>], BSAFus Li et al. [<xref ref-type="bibr" rid="B47">47</xref>], and RMRFus Zhang et al. [<xref ref-type="bibr" rid="B59">59</xref>]) for image fusion. As shown in <xref ref-type="fig" rid="F10">Figure 10</xref>, the fused images generated by the proposed method exhibit superior image quality and effectively correct artifacts and spatial deformations. In contrast, existing &#x201c;registration &#x2b; fusion&#x201d; methods often introduce noticeable artifacts when handling unregistered medical images, significantly degrading the visual quality of the fused images. Furthermore, as reported in <xref ref-type="table" rid="T7">Table 7</xref>, the quantitative analysis results further demonstrate the significant advantages of the proposed method in terms of fusion performance.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Visual comparison on the BraTS2020 dataset.</p>
</caption>
<graphic xlink:href="fphy-13-1599968-g010.tif"/>
</fig>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Quantitative analysis results on the BraTS2020 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf150">
<mml:math id="m162">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf151">
<mml:math id="m163">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf152">
<mml:math id="m164">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf153">
<mml:math id="m165">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">MATR</td>
<td align="center">0.7889</td>
<td align="center">0.2901</td>
<td align="center">
<bold>1345.4510</bold>
</td>
<td align="center">1.2299</td>
</tr>
<tr>
<td align="center">ALMFnet</td>
<td align="center">0.7749</td>
<td align="center">0.2888</td>
<td align="center">1606.5911</td>
<td align="center">1.2155</td>
</tr>
<tr>
<td align="center">EMMA</td>
<td align="center">0.7906</td>
<td align="center">0.2853</td>
<td align="center">1568.7139</td>
<td align="center">1.2220</td>
</tr>
<tr>
<td align="center">BSAFus</td>
<td align="center">0.7812</td>
<td align="center">0.3001</td>
<td align="center">1436.1287</td>
<td align="center">1.2318</td>
</tr>
<tr>
<td align="center">RMRFus</td>
<td align="center">0.7784</td>
<td align="center">0.2992</td>
<td align="center">1409.9831</td>
<td align="center">1.2007</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>0.7934</bold>
</td>
<td align="center">
<bold>0.3063</bold>
</td>
<td align="center">1399.5234</td>
<td align="center">
<bold>1.2454</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bolded values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>This paper proposes a mutual promotion algorithm for infrared and visible image fusion and object detection, tailored for unaligned image scenarios. Considering the significant modality differences between infrared and visible images, we design specific loss functions to reduce such differences, thereby easing the difficulty of cross-modality image registration and improving its accuracy. In addition, we adopt a mutually beneficial learning strategy that enables the fusion task and the downstream object detection task to enhance each other, leading to improved quality in both the fused images and detection results. Extensive qualitative and quantitative experiments demonstrate the superiority of our method over existing state-of-the-art approaches. Moreover, our approach can be applied to other radiation frequencies where different modalities require image fusion, for example, the radio-frequency, X-ray, and gamma-ray modalities used in medical imaging.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>DH: Conceptualization, Methodology, Writing &#x2013; review and editing, Writing &#x2013; original draft, Investigation. KW: Writing &#x2013; review and editing, Project administration, Data curation. CZ: Validation, Writing &#x2013; review and editing, Formal Analysis. ZL: Methodology, Supervision, Writing &#x2013; original draft, Funding acquisition, Writing &#x2013; review and editing. YC: Formal Analysis, Visualization, Project administration, Writing &#x2013; review and editing. SD: Resources, Data curation, Validation, Writing &#x2013; review and editing. CK: Resources, Writing &#x2013; review and editing, Formal Analysis.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>Authors DH, KW, CZ, ZL, YC, SD, and CK were employed by Yunnan Power Grid Co., Ltd.</p>
<p>The authors declare that this study received funding from the Science and Technology Project of China Southern Power Grid Co., Ltd. (No. YNKJXM20240052). The funder had the following involvement in the study: study design, collection, analysis, interpretation of data, the writing of this article, and the decision to submit it for publication.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that Generative AI was used in the creation of this manuscript. AI was only used to polish the paper.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Qiao</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Pedestrian detection with unsupervised multispectral feature learning using deep neural networks</article-title>. <source>Inf Fusion</source> (<year>2019</year>) <volume>46</volume>:<fpage>206</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2018.06.005</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Cross-modal ranking with soft consistency and noisy labels for robust RGB-T tracking</article-title>. In: <source>Proceedings of the European conference on computer vision (ECCV)</source> (<year>2018</year>). p. <fpage>808</fpage>&#x2013;<lpage>23</lpage>.</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>K</given-names>
</name>
<etal/>
</person-group> <article-title>Learning modal-invariant and temporal-memory for video-based visible-infrared person re-identification</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2022</year>). p. <fpage>20973</fpage>&#x2013;<lpage>82</lpage>.</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ha</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Watanabe</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Karasawa</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Ushiku</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Harada</surname>
<given-names>T</given-names>
</name>
</person-group>. <article-title>Mfnet: towards real-time semantic segmentation for autonomous vehicles with multi-spectral scenes</article-title>. In: <source>2017 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source>. <publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>5108</fpage>&#x2013;<lpage>15</lpage>.</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Pixel-level image fusion: a survey of the state of the art</article-title>. <source>Inf Fusion</source> (<year>2017</year>) <volume>33</volume>:<fpage>100</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2016.05.004</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Fast infrared and visible image fusion with structural decomposition</article-title>. <source>Knowledge-Based Syst</source> (<year>2020</year>) <volume>204</volume>:<fpage>106182</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2020.106182</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Infrared and visible image fusion scheme based on NSCT and low-level visual features</article-title>. <source>Infrared Phys Technol</source> (<year>2016</year>) <volume>76</volume>:<fpage>174</fpage>&#x2013;<lpage>84</lpage>. <pub-id pub-id-type="doi">10.1016/j.infrared.2016.02.005</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Blum</surname>
<given-names>RS</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Sparse representation based multi-sensor image fusion for multi-focus and multi-modality images: a review</article-title>. <source>Inf Fusion</source> (<year>2018</year>) <volume>40</volume>:<fpage>57</fpage>&#x2013;<lpage>75</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2017.05.006</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>A unified framework for damaged image fusion and completion based on low-rank and sparse decomposition</article-title>. <source>Signal Processing: Image Commun</source> (<year>2021</year>) <volume>98</volume>:<fpage>116400</fpage>. <pub-id pub-id-type="doi">10.1016/j.image.2021.116400</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Discriminative dictionary learning-based multiple component decomposition for detail-preserving noisy image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2020</year>) <volume>69</volume>:<fpage>1082</fpage>&#x2013;<lpage>102</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2019.2912239</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Heterogeneous knowledge distillation for simultaneous infrared-visible image fusion and super-resolution</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2022</year>) <volume>71</volume>:<fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2022.3149101</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Analysis-synthesis dictionary pair learning and patch saliency measure for image fusion</article-title>. <source>Signal Process</source> (<year>2020</year>) <volume>167</volume>:<fpage>107327</fpage>. <pub-id pub-id-type="doi">10.1016/j.sigpro.2019.107327</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X-J</given-names>
</name>
<name>
<surname>Kittler</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rfn-nest: an end-to-end residual fusion network for infrared and visible images</article-title>. <source>Inf Fusion</source> (<year>2021</year>) <volume>73</volume>:<fpage>72</fpage>&#x2013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2021.02.023</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Densefuse: a fusion approach to infrared and visible images</article-title>. <source>IEEE Trans Image Process</source> (<year>2019</year>) <volume>28</volume>:<fpage>2614</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2018.2887342</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>ZJ</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Vdmufusion: a versatile diffusion model-based unsupervised framework for image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2025</year>) <volume>34</volume>:<fpage>441</fpage>&#x2013;<lpage>54</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2024.3512365</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Stdfusionnet: an infrared and visible image fusion network based on salient target detection</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2021</year>) <volume>70</volume>:<fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2021.3075747</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Chitnet: a complementary to harmonious information transfer network for infrared and visible image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2025</year>) <volume>74</volume>:<fpage>1</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2025.3527523</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Mgfusion: a multimodal large language model-guided information perception for infrared and visible image fusion</article-title>. <source>Front Neurorobotics</source> (<year>2024</year>) <volume>18</volume>:<fpage>1521603</fpage>. <pub-id pub-id-type="doi">10.3389/fnbot.2024.1521603</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>Infrared and visible image fusion via detail preserving adversarial learning</article-title>. <source>Inf Fusion</source> (<year>2020</year>) <volume>54</volume>:<fpage>85</fpage>&#x2013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2019.07.005</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Ddcgan: a dual-discriminator conditional generative adversarial network for multi-resolution image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2020</year>) <volume>29</volume>:<fpage>4980</fpage>&#x2013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2020.2977573</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>W</given-names>
</name>
<etal/>
</person-group> <article-title>Target-aware dual adversarial learning and a multi-scenario multi-modality benchmark to fuse infrared and visible for object detection</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2022</year>). p. <fpage>5802</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Image fusion in the loop of high-level vision tasks: a semantic-aware real-time infrared and visible image fusion network</article-title>. <source>Inf Fusion</source> (<year>2022</year>) <volume>82</volume>:<fpage>28</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2021.12.004</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Detfusion: a detection-driven infrared and visible image fusion network</article-title>. In: <source>Proceedings of the 30th ACM international conference on multimedia</source> (<year>2022</year>). p. <fpage>4003</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rethinking the necessity of image fusion in high-level vision tasks: a practical infrared and visible image fusion network based on progressive semantic injection and scene fidelity</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>99</volume>:<fpage>101870</fpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2023.101870</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>W</given-names>
</name>
<etal/>
</person-group> <article-title>Multi-interactive feature learning and a full-time multi-modality benchmark for image fusion and segmentation</article-title>. In: <source>2023 IEEE/CVF international conference on computer vision (ICCV)</source> (<year>2023</year>). p. <fpage>8081</fpage>&#x2013;<lpage>90</lpage>.</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Mrfs: mutually reinforcing image fusion and segmentation</article-title>. In: <source>2024 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2024</year>). p. <fpage>26964</fpage>&#x2013;<lpage>73</lpage>.</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Instruction-driven fusion of infrared-visible images: tailoring for diverse downstream tasks</article-title>. <source>arXiv preprint arXiv:2411.09387</source> (<year>2024</year>).</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>F</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Metafusion: infrared and visible image fusion via meta-feature embedding from object detection</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2023</year>). p. <fpage>13955</fpage>&#x2013;<lpage>65</lpage>.</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rethinking the necessity of image fusion in high-level vision tasks: a practical infrared and visible image fusion network based on progressive semantic injection and scene fidelity</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>99</volume>:<fpage>101870</fpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2023.101870</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Reconet: recurrent correction network for fast and efficient multi-modality image fusion</article-title>. In: <source>European conference on computer vision (ECCV 2022)</source> (<year>2022</year>). p. <fpage>539</fpage>&#x2013;<lpage>55</lpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Unsupervised misaligned infrared and visible image fusion via cross-modality image generation and registration</article-title>. In: <source>International joint conference on artificial intelligence (IJCAI)</source> (<year>2022</year>).</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Improving misaligned multi-modality image fusion with one-stage progressive dense registration</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2024</year>) <volume>34</volume>:<fpage>10944</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2024.3412743</pub-id>
</citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Rfnet: unsupervised network for mutually reinforcing multi-modal image registration and fusion</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2022</year>). p. <fpage>19679</fpage>&#x2013;<lpage>88</lpage>.</citation>
</ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Murf: mutually reinforcing multi-modal image registration and fusion</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2023</year>) <volume>45</volume>:<fpage>12148</fpage>&#x2013;<lpage>66</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2023.3283682</pub-id>
</citation>
</ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Superfusion: a versatile image registration and fusion network with semantic awareness</article-title>. <source>IEEE/CAA J Automatica Sinica</source> (<year>2022</year>) <volume>9</volume>:<fpage>2121</fpage>&#x2013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1109/jas.2022.106082</pub-id>
</citation>
</ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Feature dynamic alignment and refinement for infrared&#x2013;visible image fusion: translation robust fusion</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>95</volume>:<fpage>26</fpage>&#x2013;<lpage>41</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2023.02.011</pub-id>
</citation>
</ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>A deep learning framework for infrared and visible image fusion without strict registration</article-title>. <source>Int J Computer Vis</source> (<year>2023</year>) <volume>132</volume>:<fpage>1625</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-023-01948-x</pub-id>
</citation>
</ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Mulfs-cap: multimodal fusion-supervised cross-modality alignment perception for unregistered infrared-visible image fusion</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2025</year>) <volume>47</volume>:<fpage>3673</fpage>&#x2013;<lpage>90</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2025.3535617</pub-id>
</citation>
</ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Divvala</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>You only look once: unified, real-time object detection</article-title>. In: <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source> (<year>2016</year>). p. <fpage>779</fpage>&#x2013;<lpage>88</lpage>.</citation>
</ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Structure-embedded ghosting artifact suppression network for high dynamic range image reconstruction</article-title>. <source>Knowledge-Based Syst</source> (<year>2023</year>) <volume>263</volume>:<fpage>110278</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2023.110278</pub-id>
</citation>
</ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Single-image HDR reconstruction assisted ghost suppression and detail preservation network for multi-exposure HDR imaging</article-title>. <source>IEEE Trans Comput Imaging</source> (<year>2024</year>) <volume>10</volume>:<fpage>429</fpage>&#x2013;<lpage>45</lpage>. <pub-id pub-id-type="doi">10.1109/tci.2024.3369396</pub-id>
</citation>
</ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Dcpnet: a dual-task collaborative promotion network for pansharpening</article-title>. <source>IEEE Trans Geosci Remote Sensing</source> (<year>2024</year>) <volume>62</volume>:<fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2024.3377635</pub-id>
</citation>
</ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>ZJ</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Mm-net: a mixformer-based multi-scale network for anatomical and functional image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2024</year>) <volume>33</volume>:<fpage>2197</fpage>&#x2013;<lpage>212</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2024.3374072</pub-id>
</citation>
</ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ling</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>U2fusion: a unified unsupervised image fusion network</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2022</year>) <volume>44</volume>:<fpage>502</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2020.3012548</pub-id>
</citation>
</ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Fusiondn: a unified densely connected network for image fusion</article-title>. In: <source>Proceedings of the thirty-fourth AAAI conference on artificial intelligence</source> (<year>2020</year>).</citation>
</ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>Infrared and visible image fusion methods and applications: a survey</article-title>. <source>Inf Fusion</source> (<year>2019</year>) <volume>45</volume>:<fpage>153</fpage>&#x2013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2018.02.004</pub-id>
</citation>
</ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Rethinking the effectiveness of objective evaluation metrics in multi-focus image fusion: a statistic-based approach</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2024</year>) <volume>46</volume>:<fpage>5806</fpage>&#x2013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2024.3367905</pub-id>
</citation>
</ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Varshney</surname>
<given-names>PK</given-names>
</name>
</person-group>. <article-title>A human perception inspired quality metric for image fusion based on regional information</article-title>. <source>Inf Fusion</source> (<year>2007</year>) <volume>8</volume>:<fpage>193</fpage>&#x2013;<lpage>207</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2005.10.001</pub-id>
</citation>
</ref>
<ref id="B49">
<label>49.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Bovik</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Sheikh</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Simoncelli</surname>
<given-names>E</given-names>
</name>
</person-group>. <article-title>Image quality assessment: from error visibility to structural similarity</article-title>. <source>IEEE Trans Image Process</source> (<year>2004</year>) <volume>13</volume>:<fpage>600</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2003.819861</pub-id>
</citation>
</ref>
<ref id="B50">
<label>50.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Todorovic</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Destr: object detection with split transformer</article-title>. In: <source>2022 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2022</year>). p. <fpage>9367</fpage>&#x2013;<lpage>76</lpage>.</citation>
</ref>
<ref id="B51">
<label>51.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>Promoting single-modal optical flow network for diverse cross-modal flow estimation</article-title>. <source>Proc AAAI Conf Artif Intelligence (AAAI)</source> (<year>2022</year>) <volume>36</volume>:<fpage>3562</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v36i3.20268</pub-id>
</citation>
</ref>
<ref id="B52">
<label>52.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>He</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Si</surname>
<given-names>T</given-names>
</name>
</person-group>. <article-title>Datfuse: infrared and visible image fusion via dual attention transformer</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2023</year>) <volume>33</volume>:<fpage>3159</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2023.3234340</pub-id>
</citation>
</ref>
<ref id="B53">
<label>53.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>He</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Ydtr: infrared and visible image fusion via y-shape dynamic transformer</article-title>. <source>IEEE Trans Multimedia</source> (<year>2023</year>) <volume>25</volume>:<fpage>5413</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1109/tmm.2022.3192661</pub-id>
</citation>
</ref>
<ref id="B54">
<label>54.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Equivariant multi-modality image fusion</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2024</year>).</citation>
</ref>
<ref id="B55">
<label>55.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Menze</surname>
<given-names>BH</given-names>
</name>
<name>
<surname>Jakab</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Bauer</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kalpathy-Cramer</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Farahani</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Kirby</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>The multimodal brain tumor image segmentation benchmark (BRATS)</article-title>. <source>IEEE Trans Med Imaging</source> (<year>2015</year>) <volume>34</volume>:<fpage>1993</fpage>&#x2013;<lpage>2024</lpage>. <pub-id pub-id-type="doi">10.1109/tmi.2014.2377694</pub-id>
</citation>
</ref>
<ref id="B56">
<label>56.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Meng</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Bi</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Correlation-aware coarse-to-fine MLPs for deformable medical image registration</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<year>2024</year>). p. <fpage>9645</fpage>&#x2013;<lpage>54</lpage>.</citation>
</ref>
<ref id="B57">
<label>57.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>He</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Matr: multimodal medical image fusion via multiscale adaptive transformer</article-title>. <source>IEEE Trans Image Process</source> (<year>2022</year>) <volume>31</volume>:<fpage>5134</fpage>&#x2013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2022.3193288</pub-id>
</citation>
</ref>
<ref id="B58">
<label>58.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mu</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Learning to search a lightweight generalized network for medical image fusion</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2024</year>) <volume>34</volume>:<fpage>5921</fpage>&#x2013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2023.3342808</pub-id>
</citation>
</ref>
<ref id="B59">
<label>59.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>A robust mutual-reinforcing framework for 3D multi-modal medical image fusion based on visual-semantic consistency</article-title>. <source>Proc AAAI Conf Artif Intelligence</source> (<year>2024</year>) <volume>38</volume>:<fpage>7087</fpage>&#x2013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v38i7.28536</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>