<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1599937</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2025.1599937</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Infrared and visible image fusion driven by multimodal large language models</article-title>
<alt-title alt-title-type="left-running-head">Wang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2025.1599937">10.3389/fphy.2025.1599937</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Ke</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Dengshu</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cheng</surname>
<given-names>Yuan</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Che</surname>
<given-names>Yukui</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3015751/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Yuelin</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jiang</surname>
<given-names>Zhiwei</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Fengxian</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Wenjuan</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
</contrib-group>
<aff>
<institution>Qujing Power Supply Bureau</institution>, <institution>Yunnan Power Grid Co., Ltd.</institution>, <addr-line>Kunming</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1050324/overview">Yu Liu</ext-link>, Hefei University of Technology, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3018984/overview">Guohua Lv</ext-link>, Qilu University of Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3019341/overview">Min Li</ext-link>, Xinjiang University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Yukui Che, <email>454983185@qq.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>22</day>
<month>05</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1599937</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>03</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>05</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Wang, Hu, Cheng, Che, Li, Jiang, Chen and Li.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Wang, Hu, Cheng, Che, Li, Jiang, Chen and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Existing image fusion methods primarily focus on obtaining high-quality features from source images to enhance the quality of the fused image, often overlooking the impact of improved image quality on downstream task performance.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address this issue, this paper proposes a novel infrared and visible image fusion approach driven by multimodal large language models, aiming to improve the performance of pedestrian detection tasks. The proposed method fully considers how enhancing image quality can benefit pedestrian detection. By leveraging a multimodal large language model, we analyze the fused images based on user-provided questions related to improving pedestrian detection performance and generate suggestions for enhancing image quality. To better incorporate these suggestions, we design a Text-Driven Feature Harmonization (Text-DFH) module. Text-DFH refines the features produced by the fusion network according to the recommendations from the multimodal large language model, enabling the fused image to better meet the needs of pedestrian detection tasks.</p>
</sec>
<sec>
<title>Results</title>
<p>Compared with existing methods, the key advantage of our approach lies in utilizing the strong semantic understanding and scene analysis capabilities of multimodal large language models to provide precise guidance for improving fused image quality. As a result, our method enhances image quality while maintaining strong performance in pedestrian detection. Extensive qualitative and quantitative experiments on multiple public datasets validate the effectiveness and superiority of the proposed method.</p>
</sec>
<sec>
<title>Discussion</title>
<p>In addition to its effectiveness in infrared and visible image fusion, the method also demonstrates promising application potential in the field of nuclear medical imaging.</p>
</sec>
</abstract>
<kwd-group>
<kwd>infrared and visible image fusion</kwd>
<kwd>pedestrian detection</kwd>
<kwd>multimodal large language models</kwd>
<kwd>text-guided</kwd>
<kwd>model fine-tuning</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Radiation Detectors and Imaging</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Multimodal sensor technology has facilitated the application of multimodal images across various fields. Among them, infrared and visible images have been widely used in diverse tasks due to the complementary nature of the information they contain. Specifically, infrared images provide thermal radiation information of objects and are not affected by lighting conditions, but they lack detailed textures. In contrast, visible images capture rich texture details of the scene but are highly sensitive to lighting variations. Therefore, numerous methods [<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B7">7</xref>] have focused on fusing infrared and visible images, aiming to integrate the complementary information from both modalities into a single, more informative fused image. This facilitates better decision-making and judgment in downstream tasks such as object detection [<xref ref-type="bibr" rid="B8">8</xref>&#x2013;<xref ref-type="bibr" rid="B10">10</xref>] and semantic segmentation [<xref ref-type="bibr" rid="B11">11</xref>&#x2013;<xref ref-type="bibr" rid="B14">14</xref>].</p>
<p>Current approaches that jointly train infrared-visible image fusion with downstream tasks can be broadly categorized into two types: independent optimization and joint optimization. Independent optimization methods first train a fusion network for infrared and visible images and then use the resulting fused images to train a downstream task network, as shown in <xref ref-type="fig" rid="F1">Figure 1a</xref>. Consequently, most independent optimization methods focus on improving fusion quality, for example, by designing new network architectures [<xref ref-type="bibr" rid="B15">15</xref>&#x2013;<xref ref-type="bibr" rid="B19">19</xref>] or introducing specific constraints [<xref ref-type="bibr" rid="B20">20</xref>&#x2013;<xref ref-type="bibr" rid="B23">23</xref>]. However, such approaches neglect the potential guidance from downstream tasks and fail to establish a deep connection between fusion and task performance, often leading to suboptimal results. Simply chaining the fusion and downstream networks makes it difficult for the fused image to specifically cater to the downstream task&#x2019;s requirements. On the other hand, joint optimization methods use the downstream task network as a constraint to train the image fusion network, thereby forcing it to produce fused images that meet task-specific needs [<xref ref-type="bibr" rid="B24">24</xref>&#x2013;<xref ref-type="bibr" rid="B28">28</xref>], as illustrated in <xref ref-type="fig" rid="F1">Figure 1b</xref>. Nevertheless, the effectiveness of directly using high-level vision task supervision to guide fusion remains limited.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Comparison of different joint training strategies for image fusion and downstream tasks.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g001.tif"/>
</fig>
<p>Recently, Multimodal Large Language Models (MLLMs) have gained popularity due to their strong capability in modeling data across different modalities, such as images and text. For instance, Text-IF [<xref ref-type="bibr" rid="B29">29</xref>] and TeRF [<xref ref-type="bibr" rid="B30">30</xref>] leverage large models to encode user instructions and guide various types of fusion tasks. However, these methods do not consider the possibility of using large language models to feed back the specific needs of high-level vision tasks to the image fusion process, which could further improve the quality of fused images.</p>
<p>To address this challenge, we propose a novel infrared and visible image fusion method driven by a Multimodal Large Language Model, aiming to simultaneously enhance fusion quality and pedestrian detection accuracy, as shown in <xref ref-type="fig" rid="F1">Figure 1c</xref>. By leveraging the deep semantic understanding and scene analysis capabilities of MLLMs, we provide precise guidance for improving fused image quality while ensuring better pedestrian detection performance. Specifically, our method analyzes the fused images based on user-provided questions related to pedestrian detection, then generates optimization suggestions using feedback from the language model. To fully utilize these suggestions, we design a Text-Driven Feature Harmonization (Text-DFH) module, which refines the fusion network&#x2019;s output features under the guidance of the MLLM, allowing the fused images to better meet the demands of pedestrian detection.</p>
<p>In summary, the main contributions of this paper are as follows:<list list-type="simple">
<list-item>
<p>(1) We are the first to leverage Multimodal Large Language Models to provide feedback on the quality of fused images based on the specific requirements of downstream tasks, thus further improving infrared and visible image fusion.</p>
</list-item>
<list-item>
<p>(2) We propose an effective Text-Driven Feature Harmonization (Text-DFH) module that enables text-based guidance to assist in enhancing image quality.</p>
</list-item>
<list-item>
<p>(3) Our proposed method achieves excellent performance in infrared and visible image fusion, nuclear medical imaging, and pedestrian detection across multiple datasets.</p>
</list-item>
</list>
</p>
<p>The remainder of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> provides a brief overview of related work on multimodal large language models, infrared and visible image fusion, and pedestrian detection. <xref ref-type="sec" rid="s3">Section 3</xref> presents our proposed method in detail. <xref ref-type="sec" rid="s4">Section 4</xref> discusses the experimental results and analysis. <xref ref-type="sec" rid="s5">Section 5</xref> concludes the paper.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>In this section, we first briefly introduce multimodal large language models, and then review existing infrared and visible image fusion methods and pedestrian detection methods.</p>
<sec id="s2-1">
<title>2.1 Multimodal large language models</title>
<p>With the advent of the multimodal data fusion era, the capability of unimodal systems is no longer sufficient to handle complex real-world tasks. As a result, multimodal large language models (MLLMs) have been proposed to integrate information from multiple data sources, enabling more comprehensive and accurate representations. These models have demonstrated significant practical value across various domains, including natural language processing, vision tasks, and audio tasks. In the visual domain, MLLMs enhance the performance of tasks such as image classification, object detection, and image captioning by combining textual descriptions with visual instructions. For example, GPT-4V [<xref ref-type="bibr" rid="B31">31</xref>] and Gemini [<xref ref-type="bibr" rid="B32">32</xref>] integrate image content with natural language descriptions to produce more vivid and accurate annotations. NExT-GPT [<xref ref-type="bibr" rid="B33">33</xref>] and Sora [<xref ref-type="bibr" rid="B34">34</xref>] are at the forefront of multimodal video generation, producing rich and realistic content by learning from multimodal data. Moreover, VideoChat [<xref ref-type="bibr" rid="B35">35</xref>] and Video-LLaVA [<xref ref-type="bibr" rid="B36">36</xref>] demonstrate excellent capabilities in analyzing and understanding video content in intelligent video understanding scenarios.</p>
<p>In the field of image fusion, Text-IF [<xref ref-type="bibr" rid="B29">29</xref>] and MGFusion [<xref ref-type="bibr" rid="B37">37</xref>] use CLIP [<xref ref-type="bibr" rid="B38">38</xref>] to encode user requirement texts, guiding the model to fuse images. TeRF [<xref ref-type="bibr" rid="B30">30</xref>] utilizes LLaMA [<xref ref-type="bibr" rid="B39">39</xref>] to encode user instruction texts and generate prompts for guiding image fusion across different tasks. Although these methods employ MLLMs to tackle some challenges in image fusion, they do not consider the specific requirements of high-level downstream visual tasks for image fusion quality, which limits the application of infrared and visible image fusion in such tasks.</p>
</sec>
<sec id="s2-2">
<title>2.2 Infrared and visible image fusion</title>
<p>Conventional infrared and visible image fusion methods mainly focus on designing sophisticated feature extraction networks and fusion strategies to ensure the quality of the fused results. From the perspective of network design, these methods can be broadly categorized into CNN-based methods, CNN-Transformer hybrid methods, and GAN-based methods. CNN-based methods [<xref ref-type="bibr" rid="B40">40</xref>&#x2013;<xref ref-type="bibr" rid="B45">45</xref>] typically apply convolution, activation, and pooling operations to extract features from the input images, then fuse and reconstruct the final result using the extracted features. However, since CNNs can only perceive local features within a limited receptive field, they struggle to capture long-range contextual information, limiting their representational capacity. In contrast, Transformers [<xref ref-type="bibr" rid="B46">46</xref>] are better at modeling long-range dependencies and are more suited for capturing global features in images. ViT [<xref ref-type="bibr" rid="B47">47</xref>] was the first to introduce Transformer architectures into computer vision, achieving promising results. Subsequently, to combine the respective strengths of CNNs and Transformers, hybrid methods have gained increasing attention in the image fusion domain. For instance, CGTF [<xref ref-type="bibr" rid="B48">48</xref>], SwinFusion [<xref ref-type="bibr" rid="B16">16</xref>], YDTR [<xref ref-type="bibr" rid="B17">17</xref>], and DATFuse [<xref ref-type="bibr" rid="B49">49</xref>] insert Transformer layers after CNN layers to jointly leverage local and global feature extraction. CDDFuse [<xref ref-type="bibr" rid="B50">50</xref>] and EMMA [<xref ref-type="bibr" rid="B51">51</xref>] adopt dual-branch architectures combining CNNs and Transformers to simultaneously extract features from the input images and integrate them for fusion.</p>
<p>GAN-based methods enhance the model&#x2019;s feature extraction capabilities by introducing adversarial learning between generators and discriminators. Depending on the number of discriminators used, these methods can be classified into single-discriminator and dual-discriminator approaches. Single-discriminator methods [<xref ref-type="bibr" rid="B2">2</xref>, <xref ref-type="bibr" rid="B52">52</xref>] tend to favor one modality over the other, potentially leading to information loss and reduced visual quality of the fusion results. To address this, dual-discriminator methods [<xref ref-type="bibr" rid="B53">53</xref>&#x2013;<xref ref-type="bibr" rid="B56">56</xref>] are proposed to preserve important features from both source images simultaneously.</p>
<p>However, all of these methods primarily focus on designing effective feature extraction networks to produce high-quality fusion features and images. They overlook how fusion quality impacts downstream task performance, and fail to consider the potential feedback from downstream tasks that could help guide fusion more effectively.</p>
</sec>
<sec id="s2-3">
<title>2.3 Pedestrian detection</title>
<p>Pedestrian detection is a fundamental problem in computer vision with a wide range of applications. Cascade R-CNN [<xref ref-type="bibr" rid="B57">57</xref>] extends R-CNN [<xref ref-type="bibr" rid="B58">58</xref>] into a multi-stage framework, improving the ability to filter hard negative samples. Faster R-CNN [<xref ref-type="bibr" rid="B59">59</xref>] introduces a Region Proposal Network (RPN) that shares convolutional features with the detection network, making region proposals nearly cost-free. YOLO [<xref ref-type="bibr" rid="B60">60</xref>] reformulates object detection as a regression problem, allowing real-time inference directly on images through a convolutional neural network. SSD [<xref ref-type="bibr" rid="B61">61</xref>] uses multi-scale feature maps and predefined anchors for pedestrian detection, addressing YOLO&#x2019;s limitations in detecting small objects. DETR [<xref ref-type="bibr" rid="B62">62</xref>] adopts a Transformer-based encoder-decoder architecture for object detection. BAS [<xref ref-type="bibr" rid="B63">63</xref>] learns to represent the whole foreground region by leveraging foreground guidance and domain constraints. CREAM [<xref ref-type="bibr" rid="B64">64</xref>] proposes a clustering-based method to enhance activation within target regions. Group R-CNN [<xref ref-type="bibr" rid="B65">65</xref>] builds instance groups to perform pedestrian detection from point annotations.</p>
<p>However, most pedestrian detection methods are designed for unimodal images, which often leads to degraded detection performance due to incomplete scene information. In this work, we perform pedestrian detection on fused infrared and visible images, and incorporate task-specific prompts generated by large language models. This not only improves the quality of the fused images but also enhances pedestrian detection performance.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methods</title>
<sec id="s3-1">
<title>3.1 Overview</title>
<p>As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, the proposed method consists of two training stages. The first stage is dedicated to training the Fusion Network, enabling it to perform basic infrared and visible image fusion. In the second stage, the parameters of the pretrained fusion network are frozen, and a Text-Driven Feature Harmonization (Text-DFH) module is trained to refine the fusion results to better align with the requirements of pedestrian detection. The fusion network is composed of three main components: an Infrared Image Feature Encoder (IR-Encoder), a Visible Image Feature Encoder (VI-Encoder), and a Fusion Feature Decoder (F-Decoder). The IR/VI-Encoders are responsible for extracting features from the input infrared and visible images, respectively, while the F-Decoder reconstructs the fused image based on the combined features. The Text-DFH module adjusts the features extracted by the IR/VI-Encoders based on responses from a Multimodal Large Language Model (MLLM), ensuring that the resulting fused image better satisfies the needs of pedestrian detection. In this work, we adopt LLaVA [<xref ref-type="bibr" rid="B66">66</xref>] as the MLLM. LLaVA analyzes the unmodulated fused image and generates suggestions in response to user queries related to pedestrian detection tasks (e.g., &#x201c;To improve the accuracy of pedestrian detection, how can the quality of this image be enhanced?&#x201d;). More text examples of LLaVA answers are shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overall framework of the proposed method. We use the IR-Encoder and VI-Encoder to extract features from the infrared and visible images, respectively. To ensure that the fused output meets the requirements of the pedestrian detection task, we input both a question related to pedestrian detection (e.g., &#x201c;To improve the accuracy of pedestrian detection, how can the quality of this image be enhanced?&#x201d;) and the unmodulated fused image into a Multimodal Large Language Model. The model provides suggestions for improving the quality of the fused image. Based on these suggestions, the Text-DFH module refines the output features of the fusion network, so that the final fusion result better aligns with the needs of the pedestrian detection task.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g002.tif"/>
</fig>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Visualization of example text answers generated by LLaVA.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g003.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Feature extraction and fusion</title>
<p>In the first training stage, we train the fusion network to perform the basic task of infrared and visible image fusion. The fusion network primarily consists of three components: the IR-Encoder, VI-Encoder, and F-Decoder. Each of the IR-Encoder, VI-Encoder, and F-Decoder is composed of three feature extraction layers. Each layer is constructed by stacking a convolutional layer (kernel size &#x3d; <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, stride &#x3d; 1), a Batch Normalization layer, and a LeakyReLU activation function. It is worth noting that the LeakyReLU activation function in the final feature extraction layer of the F-Decoder is replaced with a Tanh activation function to facilitate image reconstruction. We input the infrared image <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the visible image <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into the IR-Encoder and VI-Encoder, respectively, to extract features <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To reconstruct the fused image, we concatenate <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension and feed the result into the F-Decoder, which generates the final fused image <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>To encourage the fused image to retain as much scene information from the source images as possible, we introduce an intensity loss <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and an edge loss <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which together form the fusion loss <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e1">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes a hyperparameter used to balance the contribution of each sub-loss term. The intensity loss <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined as:<disp-formula id="e2">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>The edge loss <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined as:<disp-formula id="e3">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>Here, <inline-formula id="inf15">
<mml:math id="m18">
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the height and width of the fused image, respectively; <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the <italic>l</italic><sub>1</sub>-norm, and <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the Sobel edge extraction operator.</p>
</sec>
<sec id="s3-3">
<title>3.3 Text-driven feature harmonization</title>
<p>In the second training stage, we freeze the parameters of the pretrained fusion network and focus on training the Text-DFH module to ensure that the fusion results meet the requirements of the pedestrian detection task. Text-DFH refines the features output by the IR/VI-Encoders in the fusion network based on the responses from the multimodal large language model, enabling the fused image to better align with the needs of pedestrian detection. As shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, Text-DFH mainly consists of a dual-branch Cross Attention (CA) module and three feature extraction layers. The dual-branch cross attention computes the cross-attention between the features extracted by the IR/VI-Encoders and the textual features, allowing the model to extract useful information from the text that can help improve pedestrian detection accuracy. Subsequently, the three feature extraction layers integrate this textual information with the image scene features to generate refined features. The structure of the CA module is similar to the Multi-Scale Attention (MSA) module used in DATFuse.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Text-driven feature harmonization module.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g004.tif"/>
</fig>
<p>We input the infrared image <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and visible image <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into the pretrained fusion network with frozen parameters to obtain the fused image <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To obtain effective textual feedback that helps ensure the fused image meets the requirements of the pedestrian detection task, we input both <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the text prompt &#x201c;To improve the accuracy of pedestrian detection, how can the quality of this image be enhanced?&#x201d; into LLaVA, resulting in the textual feature <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. We then input the outputs <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the IR/VI-Encoders and the textual feature <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> into Text-DFH to harmonize the information in <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To comprehensively extract the task-relevant information from the textual features, we design a dual-branch processing strategy. In the first branch, we take <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as the Query (Q) and <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as the Key (K) and Value (V) for cross-attention computation:<disp-formula id="e4">
<mml:math id="m32">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">softmax</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>Here, <inline-formula id="inf29">
<mml:math id="m33">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents the features injected with textual information in the first branch, <inline-formula id="inf30">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the dimensionality of <inline-formula id="inf31">
<mml:math id="m35">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf32">
<mml:math id="m36">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf33">
<mml:math id="m37">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf34">
<mml:math id="m38">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. In the second branch, we use <inline-formula id="inf35">
<mml:math id="m39">
<mml:mrow>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as the Query (Q) and <inline-formula id="inf36">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as the Key (K) and Value (V) for cross-attention computation:<disp-formula id="e5">
<mml:math id="m41">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">softmax</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>Here, <inline-formula id="inf37">
<mml:math id="m42">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents the features injected with textual information in the second branch, <inline-formula id="inf38">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the dimensionality of <inline-formula id="inf39">
<mml:math id="m44">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf40">
<mml:math id="m45">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf41">
<mml:math id="m46">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf42">
<mml:math id="m47">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To comprehensively aggregate the textual information, we concatenate <inline-formula id="inf43">
<mml:math id="m48">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf44">
<mml:math id="m49">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension and feed the result into three feature extraction layers to obtain the harmonized features <inline-formula id="inf45">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We then concatenate <inline-formula id="inf46">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf47">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension and input the result into the F-Decoder to reconstruct the refined fused image <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>To ensure that the refined fused image <inline-formula id="inf49">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> meets the requirements of the pedestrian detection task, we introduce a pretrained pedestrian detection network with frozen parameters to supervise the fused image. We input <inline-formula id="inf50">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into the detection network and obtain the pedestrian detection result <inline-formula id="inf51">
<mml:math id="m56">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. To make <inline-formula id="inf52">
<mml:math id="m57">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> as close as possible to its ground truth <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we constrain the Text-DFH module using the loss function <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is the same as the one used during the training of YOLOv5.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Experiments</title>
<sec id="s4-1">
<title>4.1 Datasets</title>
<p>The proposed method consists of two training stages. In the first and second training stages, we train the fusion network and the text-driven feature harmonization module, respectively, on the publicly available LLVIP dataset [<xref ref-type="bibr" rid="B67">67</xref>], in accordance with standard practices in the field [<xref ref-type="bibr" rid="B68">68</xref>&#x2013;<xref ref-type="bibr" rid="B70">70</xref>]. Specifically, we randomly select 2,000 pairs of infrared and visible images from the LLVIP dataset as the training set. To enhance the diversity of training samples, we apply random flipping, random rotation, and random cropping as data augmentation techniques. For evaluation, we randomly select 200 pairs of infrared and visible images from each of the LLVIP, <inline-formula id="inf55">
<mml:math id="m60">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD [<xref ref-type="bibr" rid="B71">71</xref>], and MSRS [<xref ref-type="bibr" rid="B3">3</xref>] datasets to form the test set, in order to assess both the fusion performance and pedestrian detection performance of the proposed method. Among them, LLVIP, <inline-formula id="inf56">
<mml:math id="m61">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD, and MSRS are used to evaluate fusion performance, while LLVIP is specifically used to evaluate pedestrian detection performance.</p>
</sec>
<sec id="s4-2">
<title>4.2 Implementation details</title>
<p>The proposed method involves two training stages. In the first stage, the fusion network is trained. In the second stage, the parameters of the fusion network are frozen, and the text-driven feature harmonization module is trained. Both training stages use the Adam optimizer to update the network parameters, with a batch size of 16 and a learning rate of <inline-formula id="inf57">
<mml:math id="m62">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The total number of training epochs is set to 100 for the first stage and 200 for the second stage. In addition, the hyperparameter <inline-formula id="inf58">
<mml:math id="m63">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is set to 0.2. The proposed method is implemented based on the PyTorch framework and is trained on a single NVIDIA RTX A6000 GPU.</p>
</sec>
<sec id="s4-3">
<title>4.3 Evaluation metrics</title>
<p>We adopt five commonly used objective evaluation metrics to quantitatively assess the fusion performance of the proposed method. These metrics include Edge Preservation Index <inline-formula id="inf59">
<mml:math id="m64">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B72">72</xref>, <xref ref-type="bibr" rid="B73">73</xref>], Chen-Varshney Index <inline-formula id="inf60">
<mml:math id="m65">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B74">74</xref>], Structural Similarity Index <inline-formula id="inf61">
<mml:math id="m66">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B75">75</xref>], Average Gradient <inline-formula id="inf62">
<mml:math id="m67">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B76">76</xref>], and Sum of Correlations of Differences <inline-formula id="inf63">
<mml:math id="m68">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B77">77</xref>]. <inline-formula id="inf64">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> measures how well edge information from the source images is preserved in the fused image. A higher <inline-formula id="inf65">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> value indicates less loss of texture details in the fused image. <inline-formula id="inf66">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> evaluates fusion quality from the perspective of human visual perception; a lower value means the fused image aligns better with human visual preferences. <inline-formula id="inf67">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> quantifies the similarity between the fused image and the source images in terms of luminance, contrast, and structure. A higher value indicates less information difference between the fused and source images. <inline-formula id="inf68">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> measures the richness of gradient information in the fused image. A higher value means the fused image contains more detailed gradient content. <inline-formula id="inf69">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> assesses information loss during the fusion process by computing difference maps between the fused image and source images. A higher value indicates less distortion in the fused image. Among these, <inline-formula id="inf70">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf71">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf72">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf73">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are positive indicators, meaning a higher value indicates better fusion performance. <inline-formula id="inf74">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a negative indicator, meaning a lower value represents better fusion performance. In addition, to objectively evaluate the effectiveness of the fused images in the pedestrian detection task, we adopt three widely used metrics in the pedestrian detection domain for quantitative analysis: Mean Average Precision (mAP) at an IoU threshold of 0.5 <inline-formula id="inf75">
<mml:math id="m80">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, mAP at an IoU threshold of 0.75 <inline-formula id="inf76">
<mml:math id="m81">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>75</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and the mAP averaged over IoU thresholds from 0.5 to 0.95 <inline-formula id="inf77">
<mml:math id="m82">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn>95</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s4-4">
<title>4.4 Comparison with state-of-the-art methods</title>
<p>In this study, we conduct a series of qualitative and quantitative comparisons between the proposed method and eight state-of-the-art (SOTA) methods to verify its superiority in both fusion performance and pedestrian detection performance. These methods include AUIF [<xref ref-type="bibr" rid="B78">78</xref>], DATFuse [<xref ref-type="bibr" rid="B49">49</xref>], IVFWSR [<xref ref-type="bibr" rid="B79">79</xref>], LRRNet [<xref ref-type="bibr" rid="B80">80</xref>], MLFusion [<xref ref-type="bibr" rid="B81">81</xref>], TIMFusion [<xref ref-type="bibr" rid="B82">82</xref>], SwinFusion [<xref ref-type="bibr" rid="B16">16</xref>], and TextIF [<xref ref-type="bibr" rid="B29">29</xref>]. The comparative experiments are divided into two distinct groups: In the first group, we compare the fusion performance of our method with that of the SOTA methods. In the second group, we freeze the fusion networks of the compared methods and retrain their pedestrian detection networks using the corresponding fused results. The retrained detection networks are then used to perform pedestrian detection on the fused images. This setup is designed to demonstrate that our proposed method can achieve strong pedestrian detection performance without requiring retraining of the detection network.</p>
<sec id="s4-4-1">
<title>4.4.1 Fusion performance comparison</title>
<p>We conduct both quantitative and qualitative comparisons of the proposed method against AUIF, DATFuse, IVFWSR, LRRNet, MLFusion, TIMFusion, SwinFusion, and TextIF on the LLVIP, <inline-formula id="inf78">
<mml:math id="m83">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD, and MSRS datasets to validate the superiority of our method in terms of fusion performance. As shown in the enlarged regions of <xref ref-type="fig" rid="F5">Figure 5</xref>, our method effectively highlights the thermal radiation information from the infrared image while preserving fine texture details from the visible image. Compared to existing SOTA methods, the fused images produced by our method exhibit clearer local details as well as higher overall brightness and contrast at the global level. This not only improves visual quality but also facilitates better object recognition in downstream tasks. This advantage is also reflected in the quantitative evaluation results, as shown in <xref ref-type="table" rid="T1">Tables 1</xref>&#x2013;<xref ref-type="table" rid="T3">3</xref>. Specifically, our method achieves the lowest values in metric <inline-formula id="inf79">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and ranks first in both metrics <inline-formula id="inf80">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf81">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, indicating that the fused images contain richer edge information and are more consistent with human visual perception. In summary, both qualitative and quantitative results demonstrate that our proposed method offers significant improvements in fusion performance over the compared methods.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Visual comparison with SOTA methods. The top two rows, middle two rows, and bottom two rows of images are from the LLVIP, <inline-formula id="inf82">
<mml:math id="m87">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD, and MSRS datasets, respectively. The first and second columns show the infrared and visible source images, while the third to ninth columns display the fusion results produced by the compared methods.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g005.tif"/>
</fig>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Quantitative results on the LLVIP dataset. The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf83">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf84">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf85">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf86">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf87">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">AUIF</td>
<td align="center">0.3869</td>
<td align="center">610.74</td>
<td align="center">1.2016</td>
<td align="center" style="color:#0000FF">3.5256</td>
<td align="center">1.3413</td>
</tr>
<tr>
<td align="center">DATFuse</td>
<td align="center">0.4548</td>
<td align="center">453.42</td>
<td align="center" style="color:#0000FF">1.3130</td>
<td align="center">3.1243</td>
<td align="center">1.3351</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">0.2925</td>
<td align="center">512.77</td>
<td align="center">1.2348</td>
<td align="center">2.5252</td>
<td align="center">1.1235</td>
</tr>
<tr>
<td align="center">LRRNet</td>
<td align="center">0.4426</td>
<td align="center">534.89</td>
<td align="center">1.3022</td>
<td align="center">2.4625</td>
<td align="center">0.9999</td>
</tr>
<tr>
<td align="center">MLFusion</td>
<td align="center">0.3239</td>
<td align="center">523.41</td>
<td align="center">1.2624</td>
<td align="center">2.1613</td>
<td align="center">0.9966</td>
</tr>
<tr>
<td align="center">TIMFusion</td>
<td align="center">0.2325</td>
<td align="center">845.75</td>
<td align="center">1.1742</td>
<td align="center">2.1761</td>
<td align="center">0.5368</td>
</tr>
<tr>
<td align="center">SwinFusion</td>
<td align="center">0.4266</td>
<td align="center">598.53</td>
<td align="center">1.2743</td>
<td align="center">2.6346</td>
<td align="center">1.3527</td>
</tr>
<tr>
<td align="center">TextIF</td>
<td align="center" style="color:#0000FF">0.5235</td>
<td align="center" style="color:#0000FF">356.35</td>
<td align="center">1.3056</td>
<td align="center">3.4856</td>
<td align="center" style="color:#0000FF">1.4527</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">0.5845</td>
<td align="center" style="color:#FF0000">287.43</td>
<td align="center" style="color:#FF0000">1.3441</td>
<td align="center" style="color:#FF0000">3.9867</td>
<td align="center" style="color:#FF0000">1.5462</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Quantitative results on the <inline-formula id="inf88">
<mml:math id="m93">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>M</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>FD dataset. The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf89">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf90">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf91">
<mml:math id="m96">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf92">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf93">
<mml:math id="m98">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">AUIF</td>
<td align="center">0.5425</td>
<td align="center">852.56</td>
<td align="center">1.3003</td>
<td align="center" style="color:#FF0000">6.6735</td>
<td align="center" style="color:#0000FF">1.5353</td>
</tr>
<tr>
<td align="center">DATFuse</td>
<td align="center">0.4854</td>
<td align="center">563.57</td>
<td align="center">1.3067</td>
<td align="center">4.8326</td>
<td align="center">1.3461</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">0.4532</td>
<td align="center">722.22</td>
<td align="center">1.2735</td>
<td align="center">3.5628</td>
<td align="center">1.2452</td>
</tr>
<tr>
<td align="center">LRRNet</td>
<td align="center">0.5164</td>
<td align="center">579.55</td>
<td align="center" style="color:#FF0000">1.3735</td>
<td align="center">4.5624</td>
<td align="center">1.3461</td>
</tr>
<tr>
<td align="center">MLFusion</td>
<td align="center">0.4253</td>
<td align="center">689.44</td>
<td align="center">1.2835</td>
<td align="center">4.4527</td>
<td align="center">1.2687</td>
</tr>
<tr>
<td align="center">TIMFusion</td>
<td align="center">0.5352</td>
<td align="center">616.16</td>
<td align="center">1.2872</td>
<td align="center">4.3336</td>
<td align="center">1.2004</td>
</tr>
<tr>
<td align="center">SwinFusion</td>
<td align="center" style="color:#0000FF">0.5537</td>
<td align="center">588.24</td>
<td align="center">1.3086</td>
<td align="center">6.0463</td>
<td align="center">1.3456</td>
</tr>
<tr>
<td align="center">TextIF</td>
<td align="center">0.5423</td>
<td align="center" style="color:#0000FF">534.21</td>
<td align="center">1.2986</td>
<td align="center">6.4026</td>
<td align="center">1.5035</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">0.5856</td>
<td align="center" style="color:#FF0000">454.45</td>
<td align="center" style="color:#0000FF">1.3095</td>
<td align="center" style="color:#0000FF">6.4561</td>
<td align="center" style="color:#FF0000">1.6187</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Quantitative results on the MSRS dataset. The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf94">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf95">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf96">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf97">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf98">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">AUIF</td>
<td align="center">0.1736</td>
<td align="center">799.97</td>
<td align="center">0.9853</td>
<td align="center">1.8844</td>
<td align="center">1.1963</td>
</tr>
<tr>
<td align="center">DATFuse</td>
<td align="center" style="color:#0000FF">0.6326</td>
<td align="center">416.67</td>
<td align="center">1.2421</td>
<td align="center" style="color:#0000FF">3.5481</td>
<td align="center" style="color:#0000FF">1.5641</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">0.3464</td>
<td align="center">734.46</td>
<td align="center" style="color:#0000FF">1.3462</td>
<td align="center">2.1129</td>
<td align="center">1.3581</td>
</tr>
<tr>
<td align="center">LRRNet</td>
<td align="center">0.4263</td>
<td align="center">666.35</td>
<td align="center">1.2952</td>
<td align="center">2.5632</td>
<td align="center">1.0854</td>
</tr>
<tr>
<td align="center">MLFusion</td>
<td align="center">0.2656</td>
<td align="center">745.57</td>
<td align="center">1.3457</td>
<td align="center">2.6531</td>
<td align="center">1.2053</td>
</tr>
<tr>
<td align="center">TIMFusion</td>
<td align="center">0.3346</td>
<td align="center">1032.24</td>
<td align="center">1.1003</td>
<td align="center">2.6422</td>
<td align="center">1.1783</td>
</tr>
<tr>
<td align="center">SwinFusion</td>
<td align="center">0.4527</td>
<td align="center">439.46</td>
<td align="center">1.3163</td>
<td align="center">3.0042</td>
<td align="center">1.4828</td>
</tr>
<tr>
<td align="center">TextIF</td>
<td align="center">0.6125</td>
<td align="center" style="color:#0000FF">400.34</td>
<td align="center">1.3357</td>
<td align="center" style="color:#FF0000">3.6426</td>
<td align="center">1.5457</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">0.6365</td>
<td align="center" style="color:#FF0000">334.23</td>
<td align="center" style="color:#FF0000">1.3537</td>
<td align="center">3.5474</td>
<td align="center" style="color:#FF0000">1.6854</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-4-2">
<title>4.4.2 Pedestrian detection performance comparison</title>
<p>A common practice to improve the performance of fusion networks in downstream tasks is to freeze the parameters of the fusion network and retrain the downstream task network based on the generated fused results. Such approaches are referred to as &#x201c;retraining methods.&#x201d; To evaluate the effectiveness of our proposed method in pedestrian detection, we perform both quantitative and qualitative comparisons against these retraining methods. As shown in <xref ref-type="fig" rid="F6">Figure 6</xref>, the pedestrian detection results of other methods often suffer from issues such as bounding boxes that fail to fully cover the pedestrians&#x2019; bodies, or boxes that include large amounts of irrelevant background, indicating insufficient detection accuracy. In contrast, the detection results produced by our method show significantly fewer irrelevant regions within the bounding boxes and more accurate box placement. This advantage is also clearly reflected in the quantitative results, as shown in <xref ref-type="table" rid="T4">Table 4</xref>. Our method achieves the highest scores in metrics <inline-formula id="inf99">
<mml:math id="m104">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf100">
<mml:math id="m105">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>75</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf101">
<mml:math id="m106">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn>95</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, indicating superior performance in the pedestrian detection task compared to the other methods. In conclusion, our method demonstrates better performance than approaches that require retraining the pedestrian detection network, even without retraining. This highlights the effectiveness and advantage of our method in pedestrian detection tasks.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Qualitative comparison of pedestrian detection performance with &#x201c;retraining methods.&#x201d; The first and second columns show the infrared and visible source images, while the third to ninth columns display the pedestrian detection results of the compared methods.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g006.tif"/>
</fig>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Quantitative comparison of pedestrian detection performance with &#x201c;retraining methods.&#x201d; The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf102">
<mml:math id="m107">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf103">
<mml:math id="m108">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>75</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf104">
<mml:math id="m109">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn>95</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">AUIF</td>
<td align="center">98.2</td>
<td align="center" style="color:#0000FF">91.8</td>
<td align="center">74.4</td>
</tr>
<tr>
<td align="center">DATFuse</td>
<td align="center" style="color:#0000FF">99.0</td>
<td align="center">91.5</td>
<td align="center">74.3</td>
</tr>
<tr>
<td align="center">IVFWSR</td>
<td align="center">97.2</td>
<td align="center">89.6</td>
<td align="center">72.9</td>
</tr>
<tr>
<td align="center">LRRNet</td>
<td align="center">98.0</td>
<td align="center">90.8</td>
<td align="center">73.8</td>
</tr>
<tr>
<td align="center">MLFusion</td>
<td align="center">97.8</td>
<td align="center">89.9</td>
<td align="center">73.6</td>
</tr>
<tr>
<td align="center">TIMFusion</td>
<td align="center">97.9</td>
<td align="center">88.4</td>
<td align="center">74.0</td>
</tr>
<tr>
<td align="center">SwinFusion</td>
<td align="center">98.5</td>
<td align="center">90.4</td>
<td align="center">74.3</td>
</tr>
<tr>
<td align="center">TextIF</td>
<td align="center">98.9</td>
<td align="center">91.7</td>
<td align="center" style="color:#0000FF">74.6</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">99.1</td>
<td align="center" style="color:#FF0000">92.8</td>
<td align="center" style="color:#FF0000">75.0</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-4-3">
<title>4.4.3 Analysis of application potential in medical image fusion</title>
<p>Furthermore, to validate the effectiveness and application potential of the proposed method in the field of nuclear medical imaging, we deployed it in a medical image fusion task. Specifically, we conducted experiments on the BraTS2020 [<xref ref-type="bibr" rid="B83">83</xref>] dataset and performed both qualitative and quantitative analyses of the fusion results. As shown in <xref ref-type="fig" rid="F7">Figure 7</xref>, compared with state-of-the-art methods such as ALMFnet [<xref ref-type="bibr" rid="B84">84</xref>], EMMA [<xref ref-type="bibr" rid="B85">85</xref>], and RMR-Fusion [<xref ref-type="bibr" rid="B86">86</xref>], the proposed method preserves more texture details and salient information in the fused medical images. As reported in <xref ref-type="table" rid="T5">Table 5</xref>, our method ranks first or second across all evaluation metrics. These results demonstrate the promising potential of the proposed method for applications in nuclear medical imaging.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Qualitative analysis results on the medical image fusion task.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g007.tif"/>
</fig>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Quantitative analysis results on the medical image fusion task. The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf105">
<mml:math id="m110">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf106">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf107">
<mml:math id="m112">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf108">
<mml:math id="m113">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf109">
<mml:math id="m114">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">ALMFnet</td>
<td align="center" style="color:#0000FF">0.4700</td>
<td align="center">1330.59</td>
<td align="center" style="color:#0000FF">1.3432</td>
<td align="center" style="color:#0000FF">3.5826</td>
<td align="center">1.2991</td>
</tr>
<tr>
<td align="center">EMMA</td>
<td align="center">0.4682</td>
<td align="center" style="color:#0000FF">1288.99</td>
<td align="center">1.3232</td>
<td align="center">3.1826</td>
<td align="center">1.2999</td>
</tr>
<tr>
<td align="center">RMR-Fusion</td>
<td align="center">0.4419</td>
<td align="center">1344.12</td>
<td align="center">1.2967</td>
<td align="center">3.2621</td>
<td align="center" style="color:#FF0000">1.3781</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">0.4792</td>
<td align="center" style="color:#FF0000">1203.12</td>
<td align="center" style="color:#FF0000">1.3631</td>
<td align="center" style="color:#FF0000">3.7521</td>
<td align="center" style="color:#0000FF">1.3629</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4-5">
<title>4.5 Ablation study</title>
<p>The proposed method mainly consists of two core components: the Multimodal Large Language Model (MLLM) and the Text-Driven Feature Harmonization (Text-DFH) module. Within Text-DFH, both the text-guided cross-attention and the image-guided cross-attention play key roles. To validate the effectiveness of these components, we conduct a series of ablation experiments on the LLVIP dataset.</p>
<sec id="s4-5-1">
<title>4.5.1 Effectiveness of the multimodal large language model</title>
<p>We utilize the MLLM to analyze the fused images based on user-provided questions related to pedestrian detection performance and generate suggestions for improving image quality. To assess the contribution of the MLLM, we remove it and replace its feedback with a fixed text prompt: &#x201c;Brighter brightness, higher contrast, and clearer texture details.&#x201d; As shown in <xref ref-type="fig" rid="F8">Figure 8</xref>, the fusion results from the ablation model without the MLLM are noticeably inferior in visual quality compared to the full model. To further validate this, we perform quantitative analysis as presented in <xref ref-type="table" rid="T6">Table 6</xref>. The results show that the full model outperforms the ablation model on all evaluation metrics. Additionally, we analyze the performance of pedestrian detection, as shown in <xref ref-type="table" rid="T7">Table 7</xref> and <xref ref-type="fig" rid="F9">Figure 9</xref>. Both the quantitative and qualitative results indicate that the fused images produced by the ablation model without the MLLM lead to poorer detection performance. In contrast, the full model achieves better pedestrian detection results. In summary, both qualitative and quantitative analyses confirm the effectiveness of the Multimodal Large Language Model in our method.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Qualitative comparison of fusion performance across different ablation models. The first and second columns show the infrared and visible source images, while the third to seventh columns display the fusion results obtained under different ablation settings.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g008.tif"/>
</fig>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Quantitative comparison of fusion performance across different ablation models. The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf110">
<mml:math id="m115">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>B</mml:mi>
<mml:mo>/</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf111">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf112">
<mml:math id="m117">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SSIM</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf113">
<mml:math id="m118">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf114">
<mml:math id="m119">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SCD</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">w/o MLLM</td>
<td align="center">0.5472</td>
<td align="center" style="color:#0000FF">298.75</td>
<td align="center">1.3244</td>
<td align="center">3.6433</td>
<td align="center" style="color:#0000FF">1.5367</td>
</tr>
<tr>
<td align="center">w/o Text-DFH</td>
<td align="center">0.5763</td>
<td align="center">299.46</td>
<td align="center">1.3321</td>
<td align="center">3.4131</td>
<td align="center">1.4992</td>
</tr>
<tr>
<td align="center">w/o CA1</td>
<td align="center" style="color:#0000FF">0.5834</td>
<td align="center">305.92</td>
<td align="center">1.3234</td>
<td align="center">3.6362</td>
<td align="center">1.5213</td>
</tr>
<tr>
<td align="center">w/o CA2</td>
<td align="center">0.5798</td>
<td align="center">301.68</td>
<td align="center" style="color:#0000FF">1.3401</td>
<td align="center" style="color:#0000FF">3.6524</td>
<td align="center">1.5123</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">0.5845</td>
<td align="center" style="color:#FF0000">287.43</td>
<td align="center" style="color:#FF0000">1.3441</td>
<td align="center" style="color:#FF0000">3.9867</td>
<td align="center" style="color:#FF0000">1.5462</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Quantitative comparison of pedestrian detection performance across different ablation models. The best and second-best values for each evaluation metric are highlighted in red and blue, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">
<inline-formula id="inf115">
<mml:math id="m120">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf116">
<mml:math id="m121">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>75</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf117">
<mml:math id="m122">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mn>95</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">w/o MLLM</td>
<td align="center">98.5</td>
<td align="center">91.6</td>
<td align="center">73.9</td>
</tr>
<tr>
<td align="center">w/o Text-DFH</td>
<td align="center">98.8</td>
<td align="center">92.1</td>
<td align="center">74.0</td>
</tr>
<tr>
<td align="center">w/o CA1</td>
<td align="center" style="color:#0000FF">99.0</td>
<td align="center" style="color:#0000FF">92.4</td>
<td align="center" style="color:#0000FF">74.5</td>
</tr>
<tr>
<td align="center">w/o CA2</td>
<td align="center">98.9</td>
<td align="center">91.8</td>
<td align="center">74.4</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center" style="color:#FF0000">99.1</td>
<td align="center" style="color:#FF0000">92.8</td>
<td align="center" style="color:#FF0000">75.0</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Qualitative comparison of pedestrian detection performance across different ablation models. The first and second columns show the infrared and visible source images, while the third to seventh columns display the pedestrian detection results under different ablation settings.</p>
</caption>
<graphic xlink:href="fphy-13-1599937-g009.tif"/>
</fig>
</sec>
<sec id="s4-5-2">
<title>4.5.2 Effectiveness of Text-DFH</title>
<p>Text-DFH refines the output features of the fusion network based on suggestions from the multimodal large language model, enabling the fused image to better meet the requirements of the pedestrian detection task. To verify the effectiveness of Text-DFH, we remove it from the architecture and instead concatenate the text features with the image features to be refined along the channel dimension. The combined features are then processed by CNNs to obtain the refined output. We conduct both quantitative and qualitative analyses of the fusion performance of the model without Text-DFH, as shown in <xref ref-type="table" rid="T6">Table 6</xref> and <xref ref-type="fig" rid="F8">Figure 8</xref>. As observed, the ablation model without Text-DFH performs worse than the full model across multiple evaluation metrics, and the visual quality of the fused images is also inferior. In addition, we evaluate pedestrian detection performance both quantitatively and qualitatively, as presented in <xref ref-type="table" rid="T7">Table 7</xref> and <xref ref-type="fig" rid="F9">Figure 9</xref>. The full model achieves higher scores than the ablation model without Text-DFH. In summary, this series of experiments clearly demonstrates the effectiveness of the Text-DFH module.</p>
</sec>
<sec id="s4-5-3">
<title>4.5.3 Effectiveness of dual-branch cross attention</title>
<p>In the Text-DFH module, we refine image features using text features through a dual-branch cross attention mechanism. To verify its effectiveness, we remove the cross attention from each branch individually, leaving only a single branch to refine the image features. These variants are referred to as w/o CA1 and w/o CA2, respectively. From the quantitative and qualitative results on fusion performance, it is evident that removing either branch of the cross attention leads to a significant drop in performance, as shown in <xref ref-type="table" rid="T6">Table 6</xref> and <xref ref-type="fig" rid="F8">Figure 8</xref>. Furthermore, to assess the impact of dual-branch cross attention on pedestrian detection performance, we conduct both quantitative and qualitative analyses. The results demonstrate that pedestrian detection performance is optimal only when both branches of the cross attention are used to refine the image features, as shown in <xref ref-type="table" rid="T7">Table 7</xref> and <xref ref-type="fig" rid="F9">Figure 9</xref>. In conclusion, the above experiments confirm the effectiveness of the dual-branch cross attention mechanism.</p>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>To address the limitation of existing methods that primarily focus on improving fused image quality through network design&#x2014;while overlooking the potential benefits of enhanced image quality for pedestrian detection&#x2014;we propose a multimodal large language model (MLLM)-driven infrared and visible image fusion method. This method not only aims to improve the quality of the fused images but also emphasizes enhancing their performance in pedestrian detection tasks. By leveraging a multimodal large language model, we analyze the fused images based on user-provided questions related to improving pedestrian detection performance and generate suggestions for enhancing image quality. To fully utilize the guidance provided by the MLLM, we design a Text-Driven Feature Harmonization (Text-DFH) module, which refines the features output by the fusion network according to the textual suggestions. This ensures improved fusion quality while maintaining strong performance in pedestrian detection. In addition, the proposed method demonstrates significant application potential in the field of nuclear medical imaging. However, under extreme weather conditions such as rain, fog, and snow, the fusion performance of the current method may degrade. Moreover, when the proposed method is applied to other types of source images [<xref ref-type="bibr" rid="B87">87</xref>&#x2013;<xref ref-type="bibr" rid="B90">90</xref>], its performance may also decline. In future work, we plan to extend this research to develop an infrared and visible image fusion framework tailored for extreme weather scenarios, striving to maintain robust downstream task performance even in challenging environments.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>KW: Project administration, Writing &#x2013; original draft, Writing &#x2013; review and editing, Investigation, Conceptualization, Methodology. DH: Formal Analysis, Writing &#x2013; review and editing, Data curation, Validation. YaC: Visualization, Supervision, Writing &#x2013; review and editing, Resources. YkC: Funding acquisition, Project administration, Supervision, Writing &#x2013; review and editing, Writing &#x2013; original draft. YL: Validation, Writing &#x2013; review and editing, Visualization, Formal Analysis. ZJ: Writing &#x2013; review and editing, Investigation, Data curation, Resources. FC: Formal Analysis, Writing &#x2013; review and editing, Data curation. WL: Writing &#x2013; review and editing, Resources, Visualization.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by the Science and Technology Project of China Southern Power Grid Co., Ltd. (No. YNKJXM20240052).</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>Authors KW, DH, YaC, YkC, YL, ZJ, FC, and WL were employed by Yunnan Power Grid Co., Ltd.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that Generative AI was used in the creation of this manuscript. AI was only used to polish the paper.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X-J</given-names>
</name>
<name>
<surname>Kittler</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rfn-nest: an end-to-end residual fusion network for infrared and visible images</article-title>. <source>Inf Fusion</source> (<year>2021</year>) <volume>73</volume>:<fpage>72</fpage>&#x2013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2021.02.023</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Fusiongan: a generative adversarial network for infrared and visible image fusion</article-title>. <source>Inf Fusion</source> (<year>2019</year>) <volume>48</volume>:<fpage>11</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2018.09.004</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Piafusion: a progressive infrared and visible image fusion network based on illumination aware</article-title>. <source>Inf Fusion</source> (<year>2022</year>) <volume>83-84</volume>:<fpage>79</fpage>&#x2013;<lpage>92</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2022.03.007</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Infrared and visible image fusion via parallel scene and texture learning</article-title>. <source>Pattern Recognition</source> (<year>2022</year>) <volume>132</volume>:<fpage>108929</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2022.108929</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Chitnet: a complementary to harmonious information transfer network for infrared and visible image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2025</year>) <volume>74</volume>:<fpage>1</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2025.3527523</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>ZJ</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Vdmufusion: a versatile diffusion model-based unsupervised framework for image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2025</year>) <volume>34</volume>:<fpage>441</fpage>&#x2013;<lpage>54</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2024.3512365</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Sima</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Sigfusion: semantic information-guided infrared and visible image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2024</year>) <volume>73</volume>:<fpage>1</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2024.3457951</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Dian</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Lraf-net: long-range attention fusion network for visible&#x2013;infrared object detection</article-title>. <source>IEEE Trans Neural Networks Learn Syst</source> (<year>2024</year>) <volume>35</volume>:<fpage>13232</fpage>&#x2013;<lpage>45</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2023.3266452</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Pang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Shao</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Improving single shot object detection with feature scale unmixing</article-title>. <source>IEEE Trans Image Process</source> (<year>2021</year>) <volume>30</volume>:<fpage>2708</fpage>&#x2013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2020.3048630</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z-Q</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S-T</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Object detection with deep learning: a review</article-title>. <source>IEEE Trans Neural Networks Learn Syst</source> (<year>2019</year>) <volume>30</volume>:<fpage>3212</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2018.2876865</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Rethinking self-supervised semantic segmentation: achieving end-to-end segmentation</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2024</year>) <volume>46</volume>:<fpage>10036</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2024.3432326</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X</given-names>
</name>
<name>
<surname>He</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Querying labeled for unlabeled: cross-image semantic consistency guided semi-supervised semantic segmentation</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2023</year>) <volume>45</volume>:<fpage>8827</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3233584</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>A feature divide-and-conquer network for RGB-T semantic segmentation</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2023</year>) <volume>33</volume>:<fpage>2892</fpage>&#x2013;<lpage>905</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2022.3229359</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Jiao</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Mitigating modality discrepancies for RGB-T semantic segmentation</article-title>. <source>IEEE Trans Neural Networks Learn Syst</source> (<year>2024</year>) <volume>35</volume>:<fpage>9380</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2022.3233089</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Densefuse: a fusion approach to infrared and visible images</article-title>. <source>IEEE Trans Image Process</source> (<year>2019</year>) <volume>28</volume>:<fpage>2614</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2018.2887342</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Swinfusion: cross-domain long-range learning for general image fusion via swin transformer</article-title>. <source>IEEE/CAA J Automatica Sinica</source> (<year>2022</year>) <volume>9</volume>:<fpage>1200</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1109/jas.2022.105686</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>He</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Ydtr: infrared and visible image fusion via y-shape dynamic transformer</article-title>. <source>IEEE Trans Multimedia</source> (<year>2023</year>) <volume>25</volume>:<fpage>5413</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1109/tmm.2022.3192661</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Infrared and visible image fusion scheme based on NSCT and low-level visual features</article-title>. <source>Infrared Phys and Technology</source> (<year>2016</year>) <volume>76</volume>:<fpage>174</fpage>&#x2013;<lpage>84</lpage>. <pub-id pub-id-type="doi">10.1016/j.infrared.2016.02.005</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Discriminative dictionary learning-based multiple component decomposition for detail-preserving noisy image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2020</year>) <volume>69</volume>:<fpage>1082</fpage>&#x2013;<lpage>102</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2019.2912239</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hou</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Nie</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Y</given-names>
</name>
<etal/>
</person-group> <article-title>Vif-net: an unsupervised framework for infrared and visible image fusion</article-title>. <source>IEEE Trans Comput Imaging</source> (<year>2020</year>) <volume>6</volume>:<fpage>640</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.1109/tci.2020.2965304</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Shao</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Ganmcc: a generative adversarial network with multiclassification constraints for infrared and visible image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2021</year>) <volume>70</volume>:<fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2020.3038013</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Drf: disentangled representation for visible and infrared image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2021</year>) <volume>70</volume>:<fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2021.3056645</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Analysis-synthesis dictionary pair learning and patch saliency measure for image fusion</article-title>. <source>Signal Process</source> (<year>2020</year>) <volume>167</volume>:<fpage>107327</fpage>. <pub-id pub-id-type="doi">10.1016/j.sigpro.2019.107327</pub-id>
</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rethinking the necessity of image fusion in high-level vision tasks: a practical infrared and visible image fusion network based on progressive semantic injection and scene fidelity</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>99</volume>:<fpage>101870</fpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2023.101870</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>W</given-names>
</name>
<etal/>
</person-group> <article-title>Multi-interactive feature learning and a full-time multi-modality benchmark for image fusion and segmentation</article-title>. In: <source>Proceedings of the IEEE/CVF international conference on computer vision (ICCV)</source> (<year>2023</year>). p. <fpage>8115</fpage>&#x2013;<lpage>24</lpage>.</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Mrfs: mutually reinforcing image fusion and segmentation</article-title>. In: <source>2024 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2024</year>). p. <fpage>26964</fpage>&#x2013;<lpage>73</lpage>.</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>An interactively reinforced paradigm for joint infrared-visible image fusion and saliency object detection</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>98</volume>:<fpage>101828</fpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2023.101828</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Instruction-driven fusion of infrared&#x2013;visible images: tailoring for diverse downstream tasks</article-title>. <source>Inf Fusion</source> (<year>2025</year>) <volume>121</volume>:<fpage>103148</fpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2025.103148</pub-id>
</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yi</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Text-if: leveraging semantic text guidance for degradation-aware and interactive image fusion</article-title>. In: <source>2024 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2024</year>). p. <fpage>27016</fpage>&#x2013;<lpage>25</lpage>.</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xiang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Terf: text-driven and region-aware flexible visible and infrared image fusion</article-title>. In: <source>Proceedings of the 32nd ACM international conference on multimedia</source> (<year>2024</year>). p. <fpage>935</fpage>&#x2013;<lpage>44</lpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Achiam</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Adler</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Ahmad</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Akkaya</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Aleman</surname>
<given-names>FL</given-names>
</name>
<etal/>
</person-group> <article-title>Gpt-4 technical report</article-title>. <source>arXiv preprint arXiv:2303.08774</source> (<year>2023</year>).</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<collab>Gemini Team</collab>
<name>
<surname>Anil</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Borgeaud</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Alayrac</surname>
<given-names>J-B</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Soricut</surname>
<given-names>R</given-names>
</name>
<etal/>
</person-group> <article-title>Gemini: a family of highly capable multimodal models</article-title>. <source>arXiv preprint arXiv:2312.11805</source> (<year>2023</year>).</citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Fei</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Chua</surname>
<given-names>T-S</given-names>
</name>
</person-group>. <article-title>Next-gpt: any-to-any multimodal llm</article-title>. In: <source>Forty-first international conference on machine learning</source> (<year>2024</year>).</citation>
</ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R</given-names>
</name>
<etal/>
</person-group> <article-title>Sora: a review on background, technology, limitations, and opportunities of large vision models</article-title>. <source>arXiv preprint arXiv:2402.17177</source> (<year>2024</year>).</citation>
</ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>K</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>P</given-names>
</name>
<etal/>
</person-group> <article-title>Videochat: chat-centric video understanding</article-title>. <source>arXiv preprint arXiv:2305.06355</source> (<year>2023</year>).</citation>
</ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>P</given-names>
</name>
<etal/>
</person-group> <article-title>Video-llava: learning united visual representation by alignment before projection</article-title>. <source>arXiv preprint arXiv:2311.10122</source> (<year>2023</year>).</citation>
</ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Mgfusion: a multimodal large language model-guided information perception for infrared and visible image fusion</article-title>. <source>Front Neurorobotics</source> (<year>2024</year>) <volume>18</volume>:<fpage>1521603</fpage>. <pub-id pub-id-type="doi">10.3389/fnbot.2024.1521603</pub-id>
</citation>
</ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Radford</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>JW</given-names>
</name>
<name>
<surname>Hallacy</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Ramesh</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Goh</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Learning transferable visual models from natural language supervision</article-title>. In: <source>
<italic>International conference on machine learning</italic> (PMLR)</source> (<year>2021</year>). p. <fpage>8748</fpage>&#x2013;<lpage>63</lpage>.</citation>
</ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Touvron</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Lavril</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Izacard</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Martinet</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Lachaux</surname>
<given-names>M-A</given-names>
</name>
<name>
<surname>Lacroix</surname>
<given-names>T</given-names>
</name>
<etal/>
</person-group> <article-title>Llama: open and efficient foundation language models</article-title>. <source>arXiv preprint arXiv:2302.13971</source> (<year>2023</year>).</citation>
</ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ling</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>U2fusion: a unified unsupervised image fusion network</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2022</year>) <volume>44</volume>:<fpage>502</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2020.3012548</pub-id>
</citation>
</ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Ifcnn: a general image fusion framework based on convolutional neural network</article-title>. <source>Inf Fusion</source> (<year>2020</year>) <volume>54</volume>:<fpage>99</fpage>&#x2013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2019.07.011</pub-id>
</citation>
</ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Sdnet: a versatile squeeze-and-decomposition network for real-time image fusion</article-title>. <source>Int J Computer Vis</source> (<year>2021</year>) <volume>129</volume>:<fpage>2761</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-021-01501-8</pub-id>
</citation>
</ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Mulfs-cap: multimodal fusion-supervised cross-modality alignment perception for unregistered infrared-visible image fusion</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2025</year>) <volume>47</volume>:<fpage>3673</fpage>&#x2013;<lpage>90</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2025.3535617</pub-id>
</citation>
</ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Feature dynamic alignment and refinement for infrared&#x2013;visible image fusion: translation robust fusion</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>95</volume>:<fpage>26</fpage>&#x2013;<lpage>41</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2023.02.011</pub-id>
</citation>
</ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Heterogeneous knowledge distillation for simultaneous infrared-visible image fusion and super-resolution</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2022</year>) <volume>71</volume>:<fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2022.3149101</pub-id>
</citation>
</ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Shazeer</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Parmar</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Uszkoreit</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Gomez</surname>
<given-names>AN</given-names>
</name>
<etal/>
</person-group> <article-title>Attention is all you need</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2017</year>) <volume>30</volume>.</citation>
</ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Beyer</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Kolesnikov</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Weissenborn</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zhai</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Unterthiner</surname>
<given-names>T</given-names>
</name>
<etal/>
</person-group> <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. <source>arXiv preprint arXiv:2010.11929</source> (<year>2020</year>).</citation>
</ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>Cgtf: convolution-guided transformer for infrared and visible image fusion</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2022</year>) <volume>71</volume>:<fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2022.3175055</pub-id>
</citation>
</ref>
<ref id="B49">
<label>49.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>He</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Si</surname>
<given-names>T</given-names>
</name>
</person-group>. <article-title>Datfuse: infrared and visible image fusion via dual attention transformer</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2023</year>) <volume>33</volume>:<fpage>3159</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2023.3234340</pub-id>
</citation>
</ref>
<ref id="B50">
<label>50.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Z</given-names>
</name>
<etal/>
</person-group> <article-title>Cddfuse: correlation-driven dual-branch feature decomposition for multi-modality image fusion</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2023</year>). p. <fpage>5906</fpage>&#x2013;<lpage>16</lpage>.</citation>
</ref>
<ref id="B51">
<label>51.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Equivariant multi-modality image fusion</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<year>2024</year>). p. <fpage>25912</fpage>&#x2013;<lpage>21</lpage>.</citation>
</ref>
<ref id="B52">
<label>52.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>Infrared and visible image fusion via detail preserving adversarial learning</article-title>. <source>Inf Fusion</source> (<year>2020</year>) <volume>54</volume>:<fpage>85</fpage>&#x2013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2019.07.005</pub-id>
</citation>
</ref>
<ref id="B53">
<label>53.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Ddcgan: a dual-discriminator conditional generative adversarial network for multi-resolution image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2020</year>) <volume>29</volume>:<fpage>4980</fpage>&#x2013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2020.2977573</pub-id>
</citation>
</ref>
<ref id="B54">
<label>54.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Huo</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Attentionfgan: infrared and visible image fusion using attention-based generative adversarial networks</article-title>. <source>IEEE Trans Multimedia</source> (<year>2021</year>) <volume>23</volume>:<fpage>1383</fpage>&#x2013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1109/tmm.2020.2997127</pub-id>
</citation>
</ref>
<ref id="B55">
<label>55.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ling</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Semantic-supervised infrared and visible image fusion via a dual-discriminator generative adversarial network</article-title>. <source>IEEE Trans Multimedia</source> (<year>2021</year>) <volume>25</volume>:<fpage>635</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1109/tmm.2021.3129609</pub-id>
</citation>
</ref>
<ref id="B56">
<label>56.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Gan-fm: infrared and visible image fusion using gan with full-scale skip connection and dual markovian discriminators</article-title>. <source>IEEE Trans Comput Imaging</source> (<year>2021</year>) <volume>7</volume>:<fpage>1134</fpage>&#x2013;<lpage>47</lpage>. <pub-id pub-id-type="doi">10.1109/tci.2021.3119954</pub-id>
</citation>
</ref>
<ref id="B57">
<label>57.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cai</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Vasconcelos</surname>
<given-names>N</given-names>
</name>
</person-group>. <article-title>Cascade r-cnn: high quality object detection and instance segmentation</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2019</year>) <volume>43</volume>:<fpage>1483</fpage>&#x2013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2019.2956516</pub-id>
</citation>
</ref>
<ref id="B58">
<label>58.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Donahue</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rich feature hierarchies for accurate object detection and semantic segmentation</article-title>. In: <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source> (<year>2014</year>). p. <fpage>580</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B59">
<label>59.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>S</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Faster r-cnn: towards real-time object detection with region proposal networks</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2015</year>) <volume>28</volume>.</citation>
</ref>
<ref id="B60">
<label>60.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Divvala</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>You only look once: unified, real-time object detection</article-title>. In: <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source> (<year>2016</year>). p. <fpage>779</fpage>&#x2013;<lpage>88</lpage>.</citation>
</ref>
<ref id="B61">
<label>61.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Anguelov</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Erhan</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Reed</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>C-Y</given-names>
</name>
<etal/>
</person-group> <article-title>Ssd: single shot multibox detector</article-title>. In: <source>Computer vision&#x2013;ECCV 2016: 14th European conference, Amsterdam, The Netherlands, October 11&#x2013;14, 2016, proceedings, Part I 14</source>. <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>21</fpage>&#x2013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B62">
<label>62.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Carion</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Synnaeve</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Usunier</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Kirillov</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Zagoruyko</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>End-to-end object detection with transformers</article-title>. In: <source>European conference on computer vision</source>. <publisher-name>Springer</publisher-name> (<year>2020</year>). p. <fpage>213</fpage>&#x2013;<lpage>29</lpage>.</citation>
</ref>
<ref id="B63">
<label>63.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Zhai</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Background activation suppression for weakly supervised object localization</article-title>. In: <source>2022 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>. <publisher-name>IEEE</publisher-name> (<year>2022</year>). p. <fpage>14228</fpage>&#x2013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B64">
<label>64.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>R-W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T</given-names>
</name>
<etal/>
</person-group> <article-title>Cream: weakly supervised object localization via class re-activation mapping</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<year>2022</year>). p. <fpage>9437</fpage>&#x2013;<lpage>46</lpage>.</citation>
</ref>
<ref id="B65">
<label>65.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>Group r-cnn for weakly semi-supervised object detection with points</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<year>2022</year>). p. <fpage>9417</fpage>&#x2013;<lpage>26</lpage>.</citation>
</ref>
<ref id="B66">
<label>66.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>YJ</given-names>
</name>
</person-group>. <article-title>Visual instruction tuning</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2023</year>) <volume>36</volume>:<fpage>34892</fpage>&#x2013;<lpage>916</lpage>.</citation>
</ref>
<ref id="B67">
<label>67.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jia</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Llvip: a visible-infrared paired dataset for low-light vision</article-title>. In: <source>Proceedings of the IEEE/CVF international conference on computer vision workshops (ICCVW)</source> (<year>2021</year>). p. <fpage>3496</fpage>&#x2013;<lpage>504</lpage>.</citation>
</ref>
<ref id="B68">
<label>68.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Structure-embedded ghosting artifact suppression network for high dynamic range image reconstruction</article-title>. <source>Knowledge-Based Syst</source> (<year>2023</year>) <volume>263</volume>:<fpage>110278</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2023.110278</pub-id>
</citation>
</ref>
<ref id="B69">
<label>69.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Mu</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Glioma segmentation-oriented multi-modal mr image fusion with adversarial learning</article-title>. <source>IEEE/CAA J Automatica Sinica</source> (<year>2022</year>) <volume>9</volume>:<fpage>1528</fpage>&#x2013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1109/jas.2022.105770</pub-id>
</citation>
</ref>
<ref id="B70">
<label>70.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>A unified framework for damaged image fusion and completion based on low-rank and sparse decomposition</article-title>. <source>Signal Processing: Image Commun</source> (<year>2021</year>) <volume>98</volume>:<fpage>116400</fpage>. <pub-id pub-id-type="doi">10.1016/j.image.2021.116400</pub-id>
</citation>
</ref>
<ref id="B71">
<label>71.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>W</given-names>
</name>
<etal/>
</person-group> <article-title>Target-aware dual adversarial learning and a multi-scenario multi-modality benchmark to fuse infrared and visible for object detection</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2022</year>). p. <fpage>5802</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B72">
<label>72.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Rethinking the effectiveness of objective evaluation metrics in multi-focus image fusion: a statistic-based approach</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2024</year>) <volume>46</volume>:<fpage>5806</fpage>&#x2013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2024.3367905</pub-id>
</citation>
</ref>
<ref id="B73">
<label>73.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xydeas</surname>
<given-names>CS</given-names>
</name>
<name>
<surname>Petrovic</surname>
<given-names>V</given-names>
</name>
</person-group>. <article-title>Objective image fusion performance measure</article-title>. <source>Electronics Lett</source> (<year>2000</year>) <volume>36</volume>:<fpage>308</fpage>&#x2013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B74">
<label>74.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Varshney</surname>
<given-names>PK</given-names>
</name>
</person-group>. <article-title>A human perception inspired quality metric for image fusion based on regional information</article-title>. <source>Inf Fusion</source> (<year>2007</year>) <volume>8</volume>:<fpage>193</fpage>&#x2013;<lpage>207</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2005.10.001</pub-id>
</citation>
</ref>
<ref id="B75">
<label>75.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Bovik</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Sheikh</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Simoncelli</surname>
<given-names>E</given-names>
</name>
</person-group>. <article-title>Image quality assessment: from error visibility to structural similarity</article-title>. <source>IEEE Trans Image Process</source> (<year>2004</year>) <volume>13</volume>:<fpage>600</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2003.819861</pub-id>
</citation>
</ref>
<ref id="B76">
<label>76.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>L</given-names>
</name>
<etal/>
</person-group> <article-title>Infrared and visible image fusion: from data compatibility to task adaption</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2024</year>) <fpage>1</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2024.3521416</pub-id>
</citation>
</ref>
<ref id="B77">
<label>77.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>VIFB: a visible and infrared image fusion benchmark</article-title>. In: <source>2020 IEEE/CVF conference on computer vision and pattern recognition workshops (CVPRW)</source> (<year>2020</year>). p. <fpage>468</fpage>&#x2013;<lpage>78</lpage>.</citation>
</ref>
<ref id="B78">
<label>78.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Efficient and model-based infrared and visible image fusion via algorithm unrolling</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2021</year>) <volume>32</volume>:<fpage>1186</fpage>&#x2013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2021.3075745</pub-id>
</citation>
</ref>
<ref id="B79">
<label>79.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>A deep learning framework for infrared and visible image fusion without strict registration</article-title>. <source>Int J Computer Vis</source> (<year>2024</year>) <volume>132</volume>:<fpage>1625</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-023-01948-x</pub-id>
</citation>
</ref>
<ref id="B80">
<label>80.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X-J</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Kittler</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>LRRNet: a novel representation learning guided fusion network for infrared and visible images</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2023</year>) <volume>45</volume>:<fpage>11040</fpage>&#x2013;<lpage>52</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2023.3268209</pub-id>
</citation>
</ref>
<ref id="B81">
<label>81.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Cen</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Different input resolutions and arbitrary output resolution: a meta learning-based deep framework for infrared and visible image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2021</year>) <volume>30</volume>:<fpage>4070</fpage>&#x2013;<lpage>83</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2021.3069339</pub-id>
</citation>
</ref>
<ref id="B82">
<label>82.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>A task-guided, implicitly-searched and meta-initialized deep model for image fusion</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2024</year>) <volume>46</volume>:<fpage>6594</fpage>&#x2013;<lpage>609</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2024.3382308</pub-id>
</citation>
</ref>
<ref id="B83">
<label>83.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Menze</surname>
<given-names>BH</given-names>
</name>
<name>
<surname>Jakab</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Bauer</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kalpathy-Cramer</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Farahani</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Kirby</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>The multimodal brain tumor image segmentation benchmark (BRATS)</article-title>. <source>IEEE Trans Med Imaging</source> (<year>2015</year>) <volume>34</volume>:<fpage>1993</fpage>&#x2013;<lpage>2024</lpage>. <pub-id pub-id-type="doi">10.1109/tmi.2014.2377694</pub-id>
</citation>
</ref>
<ref id="B84">
<label>84.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mu</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Learning to search a lightweight generalized network for medical image fusion</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2024</year>) <volume>34</volume>:<fpage>5921</fpage>&#x2013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2023.3342808</pub-id>
</citation>
</ref>
<ref id="B85">
<label>85.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Equivariant multi-modality image fusion</article-title>. In: <source>2024 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<year>2024</year>). p. <fpage>25912</fpage>&#x2013;<lpage>21</lpage>.</citation>
</ref>
<ref id="B86">
<label>86.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>A robust mutual-reinforcing framework for 3D multi-modal medical image fusion based on visual-semantic consistency</article-title>. <source>Proc AAAI Conf Artif Intelligence</source> (<year>2024</year>) <volume>38</volume>:<fpage>7087</fpage>&#x2013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v38i7.28536</pub-id>
</citation>
</ref>
<ref id="B87">
<label>87.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>DCPNet: a dual-task collaborative promotion network for pansharpening</article-title>. <source>IEEE Trans Geosci Remote Sensing</source> (<year>2024</year>) <volume>62</volume>:<fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2024.3377635</pub-id>
</citation>
</ref>
<ref id="B88">
<label>88.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Single-image HDR reconstruction assisted ghost suppression and detail preservation network for multi-exposure HDR imaging</article-title>. <source>IEEE Trans Comput Imaging</source> (<year>2024</year>) <volume>10</volume>:<fpage>429</fpage>&#x2013;<lpage>45</lpage>. <pub-id pub-id-type="doi">10.1109/tci.2024.3369396</pub-id>
</citation>
</ref>
<ref id="B89">
<label>89.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Generation and recombination for multifocus image fusion with free number of inputs</article-title>. <source>IEEE Trans Circuits Syst Video Technology</source> (<year>2024</year>) <volume>34</volume>:<fpage>6009</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2023.3344222</pub-id>
</citation>
</ref>
<ref id="B90">
<label>90.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>ZJ</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>MM-Net: a MixFormer-based multi-scale network for anatomical and functional image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2024</year>) <volume>33</volume>:<fpage>2197</fpage>&#x2013;<lpage>212</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2024.3374072</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>