<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1661146</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2025.1661146</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>High-fidelity medical image generation: controllable synthesis of high-resolution medical images via hierarchical fusion in vector-quantized generative networks</article-title>
<alt-title alt-title-type="left-running-head">Tang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2025.1661146">10.3389/fphy.2025.1661146</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Tang</surname>
<given-names>Guangfa</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3061778"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cai</surname>
<given-names>Shanshan</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3028604"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Meng</surname>
<given-names>Xiangjun</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Huo</surname>
<given-names>SiYan</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Mengbo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lu</surname>
<given-names>Zichen</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Zhuokang</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Luo</surname>
<given-names>XiaoLing</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>School of Information and Intelligent Engineering, Guangzhou Xinhua University</institution>, <city>Dongguan</city>, <country country="CN">China</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>College of Computer Science and Software Engineering, Shenzhen University</institution>, <city>Shenzhen</city>, <country country="CN">China</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>Department of Health Industry Research, Dongguan Zhongke Institute of Cloud Computing</institution>, <city>Dongguan</city>, <country country="CN">China</country>
</aff>
<aff id="aff4">
<label>4</label>
<institution>Division of Biomedical and Life Sciences, Faculty of Health and Medicine, Lancaster University</institution>, <city>Lancaster</city>, <country country="GB">United Kingdom</country>
</aff>
<aff id="aff5">
<label>5</label>
<institution>Department of Urology, Dongguan People&#x2019;s Hospital</institution>, <city>Dongguan</city>, <country country="CN">China</country>
</aff>
<aff id="aff6">
<label>6</label>
<institution>Department of Pharmacology and Toxicology, Medical College of Wisconsin</institution>, <city>Milwaukee</city>, <state>WI</state>, <country country="US">United States</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: XiaoLing Luo, <email xlink:href="mailto:xlluo@szu.edu.cn">xlluo@szu.edu.cn</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-11-11">
<day>11</day>
<month>11</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1661146</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>13</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Tang, Cai, Meng, Huo, Wang, Lu, Chen and Luo.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Tang, Cai, Meng, Huo, Wang, Lu, Chen and Luo</copyright-holder>
<license>
<ali:license_ref start_date="2025-11-11">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Objective</title>
<p>High-resolution medical images are scarce, and existing image generation methods perform poorly at high resolutions, struggling with the representation of small lesions, loss of detailed information, distortion of anatomical structure, high computational cost, and mode collapse. This study aims to develop a novel generative framework to address the challenges of high-resolution medical image generation.</p>
</sec>
<sec>
<title>Methods</title>
<p>Clinical X-ray data from 255 patients and a public dataset containing 1,657 lung CT images with lung nodules were collected. We propose a pioneering medical image generation method that employs a two-route synthesis strategy: a foreground generation route that utilizes a generative model from a single lesion image (SinGAN) to create new lesion configurations and structures while preserving the original patch distribution and a background generation route that utilizes a high-fidelity medical image generation model, high-resolution medical image (HiResMed) Vector-Quantized Generative Adversarial Network (VQGAN), which incorporates a hierarchical dual-path fusion block (HDFB) and integrates it into a VQGAN, trained on the collected data. The HDFB module combines a dual-path learning strategy: a residual path with skip connections to capture hierarchical dependencies and multi-scale textures and a multi-scale convolutional feedforward feature extraction module (MSConvFE) that preserves low-level anatomical features through localized detail enhancement. Finally, based on the location of lesions in historical data as prior knowledge to guide the fusion position of the synthesized lesions in the background image, a high-resolution synthetic medical image with small lesions is obtained. We compared our method with denoising diffusion model (DDM), StyleSwin, VQGAN, and SinGAN using Frechet Inception Distance (FID), learned perceptual image patch similarity (LPIPS), peak signal-to-noise ratio (PSNR), and structural similarity (SSIM). Two urologists participated in a visual Turing test to assess perceptual fidelity.</p>
</sec>
<sec>
<title>Results</title>
<p>The experimental results demonstrate that the proposed method achieves state-of-the-art performance, reducing FID by 43.3% (145.64 vs. 256.11) and LPIPS by 5% (0.48 vs. 0.51), enhancing the PSNR by 4% (59.03 vs. 56.54) and SSIM by 6% (0.67 vs. 0.63), and accelerating training convergence by 83% compared to baseline VQGAN. Clinicians misclassified 55% of synthetic images as real, validating their anatomical fidelity.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>This study proposes a method for generating high-resolution medical images of small lesions. It not only ensures high-quality lesion generation but also allows controls over the number and location of lesions. Moreover, the innovative architecture enhances the detailed quality of anatomical structures and improves computational efficiency during training.</p>
</sec>
</abstract>
<kwd-group>
<kwd>controllable synthesis</kwd>
<kwd>two-route synthesis strategy</kwd>
<kwd>high-resolution medical image generation</kwd>
<kwd>hierarchical dual-path learning</kwd>
<kwd>detail preservation</kwd>
<kwd>high fidelity</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>Basic and Applied Basic Research Foundation of Guangdong Province</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100021171</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp1">2021B1515140038</award-id>
</award-group>
<award-group id="gs2">
<funding-source id="sp2">
<institution-wrap>
<institution>National Natural Science Foundation of China</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100001809</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp2">82274413</award-id>
</award-group>
<award-group id="gs3">
<funding-source id="sp3">
<institution-wrap>
<institution>Natural Science Foundation of Guangdong Province</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100003453</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp3">2025A1515010184</award-id>
</award-group>
<award-group id="gs4">
<funding-source id="sp4">
<institution-wrap>
<institution>Shenzhen Science and Technology Innovation Program</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100017610</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp4">JCYJ20240813141424032</award-id>
</award-group>
<award-group id="gs5">
<funding-source id="sp5">
<institution-wrap>
<institution>Guangdong Province for Science and Technology Innovative Young Talents</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100019337</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp5">2024KQNCX042</award-id>
<award-id rid="sp5">2023KQNCX123</award-id>
</award-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This research was supported by the Guangdong Basic and Applied Basic Research Foundation under grant no. 2021B1515140038, the National Natural Science Foundation of China (NSFC) under grant no. 82274413, the Key Discipline Research Capacity Enhancement Project of Guangdong Province in 2024 under grant no. 2024ZDJS130, the National Natural Science Foundation of China under Grant 62502320, the Natural Science Foundation of Guangdong Province under Grant 2025A1515010184, the project of Shenzhen Science and Technology Innovation Committee under Grant JCYJ20240813141424032 and JCYJ20240813112420027, Guangdong Basic and Applied Basic Research Foundation under Grant 2024A1515220079, the Foundation for Young innovative talents in ordinary universities of Guangdong under grant no. 2024KQNCX042, and the Young Innovative Talents Project for Ordinary Universities in Guangdong Province in 2023 under grant no. 2023KQNCX123.</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="6"/>
<equation-count count="22"/>
<ref-count count="46"/>
<page-count count="16"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Medical Physics and Imaging</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>High-resolution imaging is essential for numerous medical applications, including surgical navigation systems, high-precision diagnostic technologies, and early disease screening. Preoperative path planning for percutaneous nephrolithotomy for kidney stones requires comprehensive X-ray and CT imaging of the entire upper torso [<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>]. Such applications require computation and processing of high-resolution images to provide detailed anatomical information, which is essential for accurate diagnosis and effective surgical planning. However, the limited maturity of high-resolution imaging pipelines, data silos across hospitals, and strict privacy/ethics constraints make case collection and annotation difficult, time-consuming, and expensive [<xref ref-type="bibr" rid="B3">3</xref>&#x2013;<xref ref-type="bibr" rid="B6">6</xref>]. Despite the existence of synthetic data generation methods, existing methods mainly focus on low-resolution medical images of 128 pixels &#xd7; 128 pixels or 256 pixels &#xd7; 256 pixels and rarely exceed 512 pixels &#xd7; 512 pixels [<xref ref-type="bibr" rid="B7">7</xref>], or the generated effects still lack high definition and anatomical fidelity [<xref ref-type="bibr" rid="B8">8</xref>]. Zhao et al. [<xref ref-type="bibr" rid="B9">9</xref>] and Cao et al. [<xref ref-type="bibr" rid="B10">10</xref>] explored transformer-based improvements for high-resolution synthesis, but these methods have not been validated on medical images. The increasing demand for large-scale imaging in various medical fields has gradually exposed the limitations of the existing methods, including high consumption of computing resources, loss of detailed information, and distortion of anatomical structures, making it hard to achieve clinical-grade detail under limited data [<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>].</p>
<p>Generative adversarial networks (GANs) and their variants provide advanced medical image synthesis [<xref ref-type="bibr" rid="B13">13</xref>&#x2013;<xref ref-type="bibr" rid="B17">17</xref>], yet the adversarial setup often prioritizes global fidelity for fooling the discriminator, which conflicts with the high-dimensional, sparse, and strongly constrained nature of medical images and can lead to mode collapse. Another significant issue is that GANs often lose detailed information due to convolution operations such as downsampling and due to their tendency to focus on global distributions. These problems are magnified in medical images where detailed information is particularly important and the resolution is high, resulting in severe distortion of anatomical structures in the reconstructed images and the inability to generate detailed information, such as small lesions and their texture features [<xref ref-type="bibr" rid="B18">18</xref>, <xref ref-type="bibr" rid="B19">19</xref>]. In 2020, the denoising diffusion model (DDM) [<xref ref-type="bibr" rid="B20">20</xref>] achieved improved fidelity but required prohibitive computational resources, had long training and generation times, and could not easily meet the demands of immediate diagnosis. Moreover, high-frequency information is prone to over-smoothing during the denoising step and relies on large-scale, high-quality datasets to accurately learn data distribution, but medical images are usually limited in sample size, multimodal, and exhibit strong domain specificity. 
More recently, transformer-enhanced GANs, such as StyleSwin [<xref ref-type="bibr" rid="B21">21</xref>], have introduced attention mechanisms to better preserve structural details, but processing high-resolution images produces very long token sequences and incurs high self-attention costs [<xref ref-type="bibr" rid="B22">22</xref>]; the enlarged parameter space also complicates optimization and can yield divergent attention weights between the generator and discriminator, producing structural noise. Additionally, traditional evaluation metrics such as the peak signal-to-noise ratio (PSNR), structural similarity (SSIM) [<xref ref-type="bibr" rid="B23">23</xref>], and other pixel-level indicators cannot evaluate anatomical rationality, and distribution similarity metrics such as Frechet Inception Distance (FID) and the Inception Score (IS) ignore medical specificity. Therefore, clinical experts are also required to evaluate the diagnostic value of the generated images, but this process is complex and expensive. Consequently, the ability to generate high-resolution medical images with high fidelity has become a crucial research objective [<xref ref-type="bibr" rid="B24">24</xref>&#x2013;<xref ref-type="bibr" rid="B26">26</xref>].</p>
<p>Recent studies have explicitly embedded anatomical or hierarchical priors to improve high-resolution medical image synthesis. Kang [<xref ref-type="bibr" rid="B27">27</xref>] proposed a method that explicitly introduces anatomical structure preservation loss, which significantly improves the consistency of organ contours during cross-domain migration. However, it is still limited to 256 &#xd7; 256 resolution, and small lesion details are easily lost. Yu [<xref ref-type="bibr" rid="B28">28</xref>] proposed a HiFi-Syn, which includes multi-scale discriminators with layered supervision to achieve high-fidelity 512 &#xd7; 512 MRI synthesis with superior structural fidelity to traditional GANs. However, the cascaded network doubles the number of parameters, placing heavy demands on hardware computing resources, and its generalization to non-brain medical image synthesis requires further research. Yu [<xref ref-type="bibr" rid="B29">29</xref>] focused on cross-granular comparative representation of unsupervised lesion segmentation in medical images, and although it has unique explorations in lesion segmentation tasks, it does not involve the medical image generation link, has poor adaptability in multi-modal medical image data fusion scenarios, and cannot be directly applied to high-fidelity medical image synthesis tasks. Efficient-Vector-Quantized Generative Adversarial Network (VQGAN) [<xref ref-type="bibr" rid="B10">10</xref>] introduces a hierarchical transformer module that captures the global anatomical structure and local details through self-attention at different scales. However, the transformer&#x2019;s high computational complexity makes it difficult to process high-resolution images, such as 1,024 &#xd7; 1,024 resolution, and it does not optimize feature weights for the sparsity of medical images, such as small lesions. 
Although these works demonstrate the value of medical priors, none address the dual challenge of sub-millimeter lesion fidelity and computational tractability at 1,024 pixel resolutions. Our high-resolution medical image (HiResMed)-VQGAN addresses this challenge through parameterized hierarchical fusion, explicitly preserving macro-anatomical structures via residual skip connections and micro-textures via MSConvFE, while reducing computational cost. This approach enables adaptive integration of macroscopic structures, such as spinal morphology, and microscopic lesions, such as pulmonary nodules. Decoupled foreground synthesis enables precise manipulation of lesion characteristics such as size and location, which is impossible in diffusion and transformer frameworks. It achieves the collaborative optimization of &#x201c;high fidelity&#x2013;high efficiency&#x2013;controllability,&#x201d; thus providing a new paradigm for the synthesis of small-sample, high-resolution medical images in clinical practice.</p>
<p>VQGAN [<xref ref-type="bibr" rid="B30">30</xref>] is an advanced generative model proposed at the 2021 IEEE International Conference on Computer Vision and Pattern Recognition, which has demonstrated excellent performance in various applications such as high-resolution image generation, texture synthesis, and video generation, and it provides a partial solution [<xref ref-type="bibr" rid="B31">31</xref>, <xref ref-type="bibr" rid="B32">32</xref>]. The advantage of the network&#x2019;s codebook [<xref ref-type="bibr" rid="B33">33</xref>, <xref ref-type="bibr" rid="B34">34</xref>] discrete calculation mechanism is that it improves computational efficiency, but its disadvantage is that it fails to coordinate multi-scale feature learning, resulting in the inability to simultaneously preserve the macroscopic information of the anatomical structure and the microscopic structural information of tiny lesions. Therefore, the application of VQGAN to high-resolution medical image generation remains underexplored [<xref ref-type="bibr" rid="B33">33</xref>] [<xref ref-type="bibr" rid="B35">35</xref>&#x2013;<xref ref-type="bibr" rid="B38">38</xref>].</p>
<p>In the context of high-resolution medical imaging, which is above 512 &#xd7; 512 pixel images, the challenge is further compounded by the scarcity of cases, especially for small lesions that are critical for early detection of diseases such as kidney stones, early-stage tumors, and nodules. Traditional data augmentation approaches, such as downsampling, have been shown to result in the loss of critical details about small lesions, thereby compromising the quality of synthetic data. This loss of information can lead to suboptimal performance of AI models in detecting and diagnosing diseases at their earliest stages. The need for a novel approach that can generate high-resolution small-lesion medical images while preserving lesion details and maintaining data diversity is, therefore, imperative.</p>
<p>This study introduces a pioneering method that harnesses the power of the single-image generative adversarial network (SinGAN) [<xref ref-type="bibr" rid="B39">39</xref>] model for lesion generation as a foreground synthesis, complemented by an improved VQGAN model for background synthesis. We propose a novel approach to enhance the performance of the VQGAN by introducing a residual convolutional feedforward network module. This module is integrated into the encoder and decoder of a VQGAN framework. Unlike prior works, the hierarchical dual-path fusion block (HDFB) employs a dual-path learning strategy. An MSConvFE path preserves low-level anatomical structures. A residual path utilizes depth-wise convolutions and channel scaling to capture multi-scale textures. This integration accelerates the model&#x2019;s convergence, reducing training time and enhancing the detailed information in the generated high-resolution medical images. This work aims to fill the gap in the current literature and provide a robust solution for high-resolution medical image generation. Our contributions are summarized as follows:<list list-type="order">
<list-item>
<p>Controllable two-route synthesis: We decouple training into a foreground lesion route and a background route and then compose them at inference with explicit control over the lesion size and location. This enables flexible recombination and substantially expands data diversity, which is particularly valuable for rare cases.</p>
</list-item>
<list-item>
<p>HDFB for high-fidelity, efficient background generation: We introduce a dual-path block that combines residual connections for multi-scale texture modeling with an MSConvFE path for low-level anatomical preservation, addressing the fidelity&#x2013;efficiency trade-off at high resolution.</p>
</list-item>
<list-item>
<p>Architectural innovation: To the best of our knowledge, this is the first integration of a hybrid HDFB into a VQGAN encoder&#x2013;decoder for high-resolution medical imaging, improving feature extraction, gradient propagation, computational efficiency, and training speed.</p>
</list-item>
<list-item>
<p>Strong potential for clinical application: Clinicians misjudged 55% of the synthetic images as real, demonstrating that the synthetic images have high anatomical fidelity. This result verifies the feasibility and effectiveness of the framework in clinical application and provides strong support for its use in actual medical scenarios.</p>
</list-item>
</list>
</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2-1">
<label>2.1</label>
<title>Datasets</title>
<p>Our study utilizes a public and a proprietary dataset. The public dataset is LIDC-IDRI, one of the most popular benchmarks in deep learning research, containing 1,657 lung CT images with lung nodules of 512 &#xd7; 512 &#xd7; 3 resolution. The proprietary dataset DGPH-KUB comprises 255 high-resolution kidney&#x2013;ureter&#x2013;bladder (KUB) X-ray images at 3,292 &#xd7; 3,141 resolution collected from the Urology Department of Dongguan People&#x2019;s Hospital. In particular, this study has been authorized by the Ethics Committee of Dongguan People&#x2019;s Hospital (No.: KYKT2022-040). In order to eliminate the influence of other factors on our reported results, image processing software was used to adjust the resolution of the original image, and the images were uniformly changed to a resolution of 1,024 &#xd7; 1,024 &#xd7; 3. This dataset is unique in its focus on high-resolution X-ray images and is particularly valuable for research on kidney stone diagnosis and surgical navigation systems.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>High-resolution medical image VQGAN network</title>
<p>The high-fidelity, high-resolution medical image VQGAN network is proposed as a novel architecture that integrates HDFB into the encoder and decoder of VQGAN. The HDFB proposed in this paper is inserted into the encoder and decoder of VQGAN, and the specific construction method is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Overview of the controllable high-resolution medical image synthesis network. The proposed framework consists of two generation routes: the foreground and background. The foreground uses the SinGAN model to generate lesion images, and the background image is generated using the HiResMed-VQGAN model. Finally, the foreground and background are fused to obtain the result.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a medical image processing workflow. The foreground route shows SinGAN lesion generation, starting with a single lesion image, passing through a SinGAN model to generate a candidate region with prior position information. The background route involves HiResMed-VQGAN, processing high-resolution medical images with an encoder and decoder to generate a feature map, codebook, and token map, leading to a reconstructed image. Both routes integrate to create an image with lesions, producing a final result through discriminator verification.</alt-text>
</graphic>
</fig>
<p>The generation process is as follows. First, the real high-resolution medical image <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="normal">I</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is input into the HDFB-equipped encoder. The purpose of this process is to perform multi-scale feature extraction and latent space mapping. This process is mainly divided into two stages. The first stage is layered convolutional downsampling, which uses convolutional blocks with residual connections to perform downsampling three times. This process gradually compresses the spatial resolution from 1,024 &#xd7; 1,024 to 16 &#xd7; 16 while increasing the number of channels from 3 to 512, layer by layer, forming a feature pyramid that contains contextual information at different scales. The second stage is the processing of the multi-scale convolutional feedforward feature extraction. Through the parallel structure of depth-wise separable convolution and residual convolution, multi-scale features from the local texture to the global structure are captured, and the features of different branches are fused across scales by element-by-element addition.</p>
<p>The feature map <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> processed by the HDFB-equipped encoder is compressed into a continuous latent space representation through a 1 &#xd7; 1 convolution and then mapped to a discrete codebook space through a vector quantization layer. The codebook is a set of predefined vectors that maps the continuous latent space to the discrete codebook space [<xref ref-type="bibr" rid="B16">16</xref>]. Let <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="normal">B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">b</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">D</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> denote a codebook containing N entries, with each entry <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">b</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> being a D-dimensional trainable embedding with random initialization. These vectors are continuously updated during the training process so that the model can learn a discrete representation to better represent the features of the input image. Subsequently, the quantizer in the codebook maps <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to a token map <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where each token is an entry in B based on the cosine distance between <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and B.</p>
<p>Then, the HDFB-equipped decoder reconstructs the original image from the token map <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. This process is divided into two stages. The first stage is layered deconvolution upsampling. Deconvolution blocks with skip connections are used for upsampling three times to gradually restore the spatial resolution from 16 &#xd7; 16 to 1,024 &#xd7; 1,024, and the number of channels is compressed from 256 to 3 layer by layer. After each upsampling, a residual convolution feedforward network is inserted, and the weights of features of different scales are dynamically adjusted through the channel attention mechanism. The second stage is the detail enhancement convolution calculation. After the final upsampling layer, the high-frequency texture of the reconstruction is captured through the parallel structure of the depth-wise separable convolution and the residual convolution, and a high-quality reconstructed image is generated.</p>
<p>Finally, the discriminator, composed of two convolutional layers, a normalization operation, and an activation function, calculates the authenticity probability of the real image and the reconstructed image, and it distinguishes the authenticity of local details, medium-scale structures, and global layout of the generated image. The ultimate goal is to continuously optimize the generator based on the feedback from the discriminator, enabling the generator to produce reconstructed images capable of deceiving the discriminator.</p>
<p>The entire network is optimized using a combination of losses, which is expressed as follows (<xref ref-type="disp-formula" rid="e1">Equation 1</xref>):<disp-formula id="e1">
<mml:math id="m9">
<mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>I</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mi>t</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mi>t</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf9">
<mml:math id="m10">
<mml:mrow>
<mml:mtext>sg</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#xb7;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the stop-gradient operation. <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mover accent="true">
<mml:mi mathvariant="normal">I</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3b1;</mml:mi>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mtext>sg</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">&#x3b3;</mml:mi>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mtext>sg</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mtext>GAN</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the reconstruction loss, quantization loss, VGG-based perceptual loss [<xref ref-type="bibr" rid="B27">27</xref>], and GAN loss [<xref ref-type="bibr" rid="B27">27</xref>], respectively. The hyper-parameters <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are, respectively, set to 1.0 and 0.33 by default.</p>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Hierarchical dual-path fusion block</title>
<p>The HDFB is designed to optimize feature representation and gradient propagation in high-resolution medical image synthesis. The structure is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. By integrating sequential normalization, activation, and multi-scale feature learning with skip connections, the HDFB ensures both anatomical fidelity and computational efficiency. In the HDFB, the input data tensor, defined by its height, width, and number of channels, is first passed through a GroupNorm&#x2013;SiLU pair (<xref ref-type="disp-formula" rid="e2">Equations 2</xref>, <xref ref-type="disp-formula" rid="e3">3</xref>):<disp-formula id="e2">
<mml:math id="m17">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mrow>
<mml:mtext>norm</mml:mtext>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>GroupNorm</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m18">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mrow>
<mml:mtext>act</mml:mtext>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>SiLU</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mrow>
<mml:mtext>norm</mml:mtext>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Structural diagram of HDFB. A residual path utilizes skip connections to capture hierarchical dependencies and multi-scale textures. The multi-scale convolutional feedforward feature extraction module preserves low-level anatomical features through localized detail enhancement.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g002.tif">
<alt-text content-type="machine-generated">Diagram of a multi-scale convolutional feedforward feature extraction module. It shows a sequence: Feature Maps, GroupNorm for stabilizing training, Conv for localized features, GeLU for gradient preservation, depthwise separable convolutions for capturing textures, and DropPath to prevent overfitting. These lead to MSCONVFE, followed by SiLU, Group Norm, Conv2, and SiLU again, and another Group Norm. The process maintains low-frequency anatomical features.</alt-text>
</graphic>
</fig>
<p>We use a smoothly gated non-linear activation defined as follows (<xref ref-type="disp-formula" rid="e4">Equation 4</xref>):<disp-formula id="e4">
<mml:math id="m19">
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mtext>iLU</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf16">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the input, and the function is differentiable over the entire real domain, which makes the gradient smooth and continuous during the backpropagation process, avoiding the vanishing gradient problem and helping improve the stability and convergence speed of training. Its non-monotonicity allows the function to take both positive and negative values, providing richer information-processing ability, so it can better capture the detailed information of the anatomical structure. SiLU [<xref ref-type="bibr" rid="B40">40</xref>] preserves gradient information better than ReLU, especially for subtle features. After applying 2D convolutional layers to extract local spatial features, we repeat normalization and activation (<xref ref-type="disp-formula" rid="e5">Equations 5</xref>, <xref ref-type="disp-formula" rid="e6">6</xref>), thus amplifying discriminative features while suppressing noise.<disp-formula id="e5">
<mml:math id="m21">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<disp-formula id="e6">
<mml:math id="m22">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>The final output is fed into the MSConvFE block (<xref ref-type="disp-formula" rid="e7">Equation 7</xref>) to enhance multi-scale feature learning:<disp-formula id="e7">
<mml:math id="m23">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>In order to mitigate vanishing gradients and preserve low-frequency anatomical structures, we introduce a skip connection (<xref ref-type="disp-formula" rid="e8">Equation 8</xref>):<disp-formula id="e8">
<mml:math id="m24">
<mml:mrow>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>The residual block, previously used in both the encoder and decoder, was replaced by the HDFB module, resulting in several significant improvements. First, the convolutional feedforward network further analyzes and processes these details by retaining low-level details in residual blocks. In particular, it contains multiple convolutional layers and fully connected layers, and its complex structure can capture finer-grained patterns and relationships in the data. When processing medical images, the convolutional feedforward network can perform an in-depth analysis of details, such as texture and density changes in organs, soft tissues, and bone regions, thereby extracting more subtle features. Through in-depth analysis, this detailed information enables the decoder to reconstruct high-resolution medical images with greater accuracy, thereby enhancing overall network performance in terms of reconstruction quality and generation fidelity. The enhanced feature extraction in the encoder and the improved detail-handling in the decoder result in more accurate reconstructions and higher-quality generated outputs. Second, the HDFB-equipped VQGAN is more robust in terms of noise and input variations. The skip connections in HDFB and its non-linear transformation capabilities help the network to better adapt to different input conditions, which is beneficial in real-world applications where the input data may be corrupted or have diverse characteristics. Third, the combination of HDFB and VQGAN can lead to more efficient training. The HDFB blocks&#x2019; ability to mitigate the vanishing gradient problem and their effective feature processing can accelerate the convergence of the network during training, thus reducing the overall training time and computational resources required.</p>
<p>To enhance the multi-scale feature learning and preserve fine-grained details simultaneously, we design a hybrid architecture for the multi-scale convolutional feedforward feature extraction module, addressing the dual challenges of anatomical coherence and texture fidelity in high-resolution medical image generation. While classical feedforward modules focus on global context aggregation, our MSConvFE uniquely integrates localized detail enhancement, hierarchical multi-scale modeling, and improved computational efficiency through a dual-path structure. The architectural components are shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. Moreover, the 2D convolutional layer can extract local spatial features. Group normalization [<xref ref-type="bibr" rid="B41">41</xref>] divides the channels into several groups and calculates the mean and variance within each group for normalization (<xref ref-type="disp-formula" rid="e9">Equation 9</xref>); the formula is as follows:<disp-formula id="e9">
<mml:math id="m25">
<mml:mrow>
<mml:mrow>
<mml:mtext>GN</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">G</mml:mi>
</mml:msub>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">G</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#xb7;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="normal">&#x3b3;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">&#x3b2;</mml:mi>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf17">
<mml:math id="m26">
<mml:mrow>
<mml:mi mathvariant="normal">G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the number of groups; <inline-formula id="inf18">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">G</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf19">
<mml:math id="m28">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">G</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent the mean and variance of channels in each group, respectively; and <inline-formula id="inf20">
<mml:math id="m29">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf21">
<mml:math id="m30">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represent the parameters of learnable scaling and translation, respectively. This method does not depend on the batch size and helps stabilize the training process. In order to prevent the gradient explosion problem and improve the convergence speed of the model, we introduce a batch normalization operation after SiLU activation function processing. The GeLU activation function introduces nonlinearity through the probability of a Gaussian distribution, and its calculation formula is as follows (<xref ref-type="disp-formula" rid="e10">Equation 10</xref>):<disp-formula id="e10">
<mml:math id="m31">
<mml:mrow>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf22">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the input, and <inline-formula id="inf23">
<mml:math id="m33">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the cumulative distribution function of the normal distribution. Due to its high computational complexity, an approximate expression is often used to simplify the calculation [<xref ref-type="bibr" rid="B42">42</xref>].<disp-formula id="e11">
<mml:math id="m34">
<mml:mrow>
<mml:mrow>
<mml:mtext>GeLU</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>0.5</mml:mn>
<mml:mo>&#xb7;</mml:mo>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msqrt>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3c0;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:msqrt>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>0.044715</mml:mn>
<mml:msup>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>The nonlinear nature of the GeLU activation function (<xref ref-type="disp-formula" rid="e11">Equation 11</xref>) can enhance the model&#x2019;s ability to fit complex data, and the activation degree of the GeLU function is proportional to the size of the input value, which is helpful for the learning and generalization of the model. In particular, this paper introduces a depth-wise separable convolution layer, which is composed of a depth-wise convolution and a point-wise convolution, where the depth-wise convolution is a convolution operation performed on each channel of the input feature map. Specifically, for an RGB three-channel image, the depth-wise convolution uses three single-channel convolution kernels to convolve the three input channels, respectively, and outputs the feature maps of the three channels. In this way, the convolution kernel of each channel only needs to process the data of one channel, which greatly reduces the number of parameters and the amount of calculation. Point-wise convolution is a 1 &#xd7; 1 convolution operation applied to the output of the depth-wise convolution to merge the features of different channels. Specifically, the point-wise convolution uses a 1 &#xd7; 1 convolution kernel to convolve the output of the depth-wise convolution, fuses the features of different channels, and generates the final output feature map. Therefore, this combination can not only significantly improve the performance of the model but also optimize the computing resources.</p>
<p>For an input feature map <inline-formula id="inf24">
<mml:math id="m35">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi mathvariant="normal">C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf25">
<mml:math id="m36">
<mml:mrow>
<mml:mi mathvariant="normal">H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf26">
<mml:math id="m37">
<mml:mrow>
<mml:mi mathvariant="normal">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf27">
<mml:math id="m38">
<mml:mrow>
<mml:mi mathvariant="normal">C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represent the height, width, and number of channels, respectively, the MSConvFE processes the feature as follows (<xref ref-type="disp-formula" rid="e12">Equation 12</xref>):<disp-formula id="e12">
<mml:math id="m39">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="" separators="&#x7c;">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mi>x</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mi>W</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mi>x</mml:mi>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>The skip connection retains raw anatomical features, while the output of the processed branch, <italic>F</italic><sub>drop</sub>, refines high-frequency details. This modification ensures that both high-frequency details and low-frequency features are preserved, which is critical for high-resolution medical image synthesis.</p>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>Lesion synthesis</title>
<p>In this study, the preprocessed lesion images obtained from the previous step serve as the input to the lesion generation model. We employ the SinGAN model for generating synthetic lesion images. SinGAN is a single-image GAN that is particularly well-suited for medical image synthesis tasks where data scarcity is a common challenge. Unlike traditional GANs that require large datasets for effective training, SinGAN can achieve convergence with only a single training image, making it an ideal choice for generating lesion images in scenarios with limited data availability. Therefore, the problem of poor generation quality due to insufficient data volume can be avoided. Moreover, data scarcity and data silos have always been common problems in medical data. The SinGAN model is based on a pyramid of fully convolutional GANs, where each level of the pyramid learns to capture the statistical properties of the input image at different scales. This hierarchical structure enables the model to generate high-quality synthetic images that preserve the fine-grained details of the original lesion. The key advantage of SinGAN lies in its ability to learn from a single image, which is particularly beneficial for medical imaging applications where annotated datasets are often limited. Given a preprocessed lesion image <inline-formula id="inf28">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the SinGAN model generates synthetic lesion images by learning the distribution of the input image across multiple scales. The generation process can be formally described as follows (<xref ref-type="disp-formula" rid="e13">Equation 13</xref>):<disp-formula id="e13">
<mml:math id="m41">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>where <inline-formula id="inf29">
<mml:math id="m42">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#xb7;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the SinGAN generator network.</p>
<p>During the training stage, the SinGAN model is trained on a single preprocessed real lesion image, learning a multi-scale representation of its texture and structure. During inference, no real lesion image is fed into the network. Instead, new lesions are synthesized by sampling random noise at the coarsest scale and progressively refining it through the trained scales. This process allows lesion generation to be conditioned solely on the learned internal distribution of the training exemplar without reusing the original image.</p>
<p>To generate high-resolution medical images containing small lesions, the synthetic lesion images <inline-formula id="inf30">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are placed into a high-resolution background image <inline-formula id="inf31">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The background image <inline-formula id="inf32">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is initialized as a zero matrix with the same dimensions as the background high-resolution image <inline-formula id="inf33">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is the output of the background route. The placement of the synthetic lesions is guided by prior knowledge of lesion locations derived from historical patient data.</p>
<p>To ensure anatomically plausible placement of synthetic lesions within the background image <inline-formula id="inf34">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we introduce a dual-model prior position information framework. This framework combines the location model and danger zone detection model. The location model is a YOLOv11-based detector trained on historical lesion annotations to probabilistically predict likely lesion locations. Formally, for the background image <inline-formula id="inf35">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the model outputs the following set of candidate coordinates (<xref ref-type="disp-formula" rid="e14">Equation 14</xref>):<disp-formula id="e14">
<mml:math id="m49">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where each coordinate represents a high-probability lesion occurrence region learned from historical distributions.</p>
<p>The danger zone detection model is a U-Net segmentation network that is trained to identify anatomically implausible regions (e.g., bones, major vessels, and spinal column in KUB X-rays and pleural surfaces in lung CTs). The model generates a binary mask <inline-formula id="inf36">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, where (<xref ref-type="disp-formula" rid="e15">Equation 15</xref>)<disp-formula id="e15">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="" separators="&#x7c;">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mtext> </mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext> </mml:mtext>
<mml:mi>z</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>:</mml:mo>
<mml:mtext> </mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext> </mml:mtext>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext> </mml:mtext>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mtext> </mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext> </mml:mtext>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext> </mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>We design a candidate region filtering <inline-formula id="inf37">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for the final candidate region calculation (<xref ref-type="disp-formula" rid="e16">Equation 16</xref>). <inline-formula id="inf38">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is derived by imposing anatomical constraints as follows:<disp-formula id="e16">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>Users may either manually select the coordinates of interest from these safe regions or allow random sampling to determine the final lesion insertion regions, <inline-formula id="inf39">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mtext>lesion</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Finally, the pixel values of <inline-formula id="inf40">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are filled into <inline-formula id="inf41">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mtext>lesion</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf42">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> contains <inline-formula id="inf43">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> information, and we may also define <inline-formula id="inf44">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as <inline-formula id="inf45">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Specifically, this process ensures that the generated lesions are anatomically plausible and consistent with real-world medical imaging scenarios. The final high-resolution image <inline-formula id="inf46">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> containing the synthetic lesion is obtained by combining <inline-formula id="inf47">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf48">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> using a pixel-wise addition operation, which is as follows (<xref ref-type="disp-formula" rid="e17">Equation 17</xref>):<disp-formula id="e17">
<mml:math id="m65">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>In particular, the pixel values in the <inline-formula id="inf49">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mtext>lesion</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> regions of <inline-formula id="inf50">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> need to be set to 0.</p>
<p>Since this merging method is pixel-level, there will inevitably be excessive seams between the edge of the lesion and the background. We leverage Sobel edge detection and Gaussian blurring to achieve natural pixel-level continuity.</p>
<p>First, the contour of the synthetic lesion is extracted using the Sobel operator, which computes gradient magnitudes along both the horizontal and vertical directions to identify edge pixels. This step isolates the boundary between the lesion and its surrounding area, ensuring precise targeting of the transition region. Subsequently, a Gaussian blur (with a kernel size of 3 &#xd7; 3 and standard deviation &#x3c3; &#x3d; 1.0, which is empirically optimized for medical image textures) is applied to the detected edge. This blurring operation creates a gradual intensity transition between the lesion and the background: edge pixels are weighted by a Gaussian distribution, with values smoothly decreasing from the periphery of the lesion to the background.</p>
<p>This approach minimizes abrupt intensity changes at the lesion boundary, thus enhancing the visual coherence of the integrated image without introducing excessive computational overhead.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec id="s3-1">
<label>3.1</label>
<title>Implementation details</title>
<p>Weights were initialized using torch.nn.init (mean 0 and standard deviation 0.02), and training was conducted for up to 3,500 epochs. The codebook dimension for vector quantization is selected as 256 to align with the feature dimension of the encoder output. More training hyperparameters are summarized in <xref ref-type="table" rid="T1">Table 1</xref>. All experiments were conducted on a single NVIDIA V100 GPU with 32 GB of memory. We synthesize images at 1,024 &#xd7; 1,024 resolution for both datasets. Owing to computational constraints, DDM was trained to generate 128 &#xd7; 128 &#xd7; 3 images, which were subsequently upsampled to 1,024 &#xd7; 1,024 &#xd7; 3 for comparison.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Training parameters utilized in the HiResMed-VQGAN model.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Parameter item</th>
<th align="center">Value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Batch size</td>
<td align="center">8</td>
</tr>
<tr>
<td align="center">Epochs</td>
<td align="center">3,500</td>
</tr>
<tr>
<td align="center">Loss function</td>
<td align="center">Reconstruction loss &#x2b; adversarial loss &#x2b; perceptual loss</td>
</tr>
<tr>
<td align="center">Learning rate</td>
<td align="center">2.25e-05</td>
</tr>
<tr>
<td align="center">Number of codebook vectors</td>
<td align="center">1,024</td>
</tr>
<tr>
<td align="center">Optimizer</td>
<td align="center">Adam (eps &#x3d; 1e-08, betas &#x3d; (0.5, 0.9))</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Quantitative evaluation</title>
<p>We used several quantitative metrics to assess the quality of the generated high-resolution medical images, which are detailed as follows.<list list-type="order">
<list-item>
<p>Fr&#xe9;chet Inception Distance (FID)</p>
</list-item>
</list>
</p>
<p>The FID [<xref ref-type="bibr" rid="B43">43</xref>] calculates indicators of the quality and diversity of the generated image by comparing the distribution of the generated image with the real image in a specific space. The definition is as follows (<xref ref-type="disp-formula" rid="e18">Equation 18</xref>):<disp-formula id="e18">
<mml:math id="m68">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>where <inline-formula id="inf51">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf52">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi mathvariant="normal">r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the mean and covariance matrix of real image features, respectively, and <inline-formula id="inf53">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf54">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi mathvariant="normal">g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the mean and covariance matrix of the generated image features, respectively. <inline-formula id="inf55">
<mml:math id="m73">
<mml:mrow>
<mml:mtext>Tr</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> is the trace of a matrix.<list list-type="order">
<list-item>
<p>Learned perceptual image patch similarity (LPIPS)</p>
</list-item>
</list>
</p>
<p>The LPIPS [<xref ref-type="bibr" rid="B44">44</xref>] is a perceptual similarity measure based on deep learning, which is used to measure the perceptual difference between two images. Its definition is formulated as follows (<xref ref-type="disp-formula" rid="e19">Equation 19</xref>):<disp-formula id="e19">
<mml:math id="m74">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c9;</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>where <inline-formula id="inf56">
<mml:math id="m75">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the lth feature map. It is normalized with respect to the initial feature map <inline-formula id="inf57">
<mml:math id="m76">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> in the channel dimension using unit normalization, and the number of activated channels is scaled using <inline-formula id="inf58">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3c9;</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>; the L2 distance value is then calculated. Here, &#x2297; is the dot product operation.<list list-type="order">
<list-item>
<p>Peak signal-to-noise ratio (PSNR).</p>
</list-item>
</list>
</p>
<p>The PSNR measures the pixel-wise similarity between the generated images and the ground truth. The definition is as follows (<xref ref-type="disp-formula" rid="e20">Equation 20</xref>):<disp-formula id="e20">
<mml:math id="m78">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>log</mml:mi>
<mml:mn>10</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mn>2</mml:mn>
<mml:mi>n</mml:mi>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>where n is the number of bits used to represent each pixel. In this study, we process the RGB images, so n &#x3d; 24. MSE stands for the mean squared error, which is defined as follows (<xref ref-type="disp-formula" rid="e21">Equation 21</xref>):<disp-formula id="e21">
<mml:math id="m79">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>H</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>W</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>where H &#xd7; W is the number of pixels in the image, H and W are the height and width of the image, X is the enhanced image, and Y is the real clear image.<list list-type="order">
<list-item>
<p>Structural similarity (SSIM)</p>
</list-item>
</list>
</p>
<p>The similarity between two images is measured from three dimensions: brightness, contrast, and structure. The value range is [0, 1], and the closer the value is to 1, the more similar it is. The calculation formula is as follows (<xref ref-type="disp-formula" rid="e22">Equation 22</xref>):<disp-formula id="e22">
<mml:math id="m80">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf59">
<mml:math id="m81">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represent the local mean, variance, and covariance of the image, respectively.</p>
<p>To rigorously evaluate the stochasticity induced by lesion sampling, we generated five independent samples per test background (using different random seeds) and report the mean &#xb1; standard deviation (SD) in <xref ref-type="table" rid="T2">Table 2</xref>. Three key findings emerged. First, in terms of lesion generation fidelity, on KUB X-ray data, the FID decreased to 145.64 &#xb1; 5.23, which is a significant 43.3% reduction compared to the baseline VQGAN (p &#x3c; 0.001), and the standard deviation (SD &#x3d; 5.23) was the lowest among all the methods, demonstrating optimal generation stability. Although DDM performed well in terms of LPIPS (0.46 &#xb1; 0.03) and PSNR (63.10 &#xb1; 1.27), it failed to generate visible lesions (<xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref>). Our method, on the other hand, successfully synthesized small lesions while preserving the anatomical structure (LPIPS &#x3d; 0.48 &#xb1; 0.02, PSNR &#x3d; 59.03 &#xb1; 0.95). Second, in terms of cross-modal generalization ability, in the CT dataset (LIDC-IDRI), the FID (180.29 &#xb1; 6.87) of this method significantly outperformed all baselines (<italic>p</italic> &#x3c; 0.001), and the PSNR (64.46 &#xb1; 0.84) was the best (<italic>p</italic> &#x3c; 0.05). The variability caused by the prior lesion (LPIPS fluctuation SD &#x2264; 0.02) is far below the human eye perception threshold (LPIPS &#x3e; 0.05 can be perceived [<xref ref-type="bibr" rid="B44">44</xref>]), proving the clinical reliability of the synthesized results. Third, statistical significance was verified by paired t-tests (Bonferroni correction, &#x3b1; &#x3d; 0.05). The FID improvement of this method was significant for all baselines (p &#x3c; 0.001), and PSNR was significantly better than DDM on CT data (<italic>p</italic> &#x3c; 0.05). 
Due to computational resource limitations, DDM can generate images only at 128 &#xd7; 128 resolution, which must then be upsampled to 1,024 &#xd7; 1,024. This results in high PSNR values while failing to capture true high-resolution details (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Quantitative results (mean &#xb1; SD) compared with state-of-the-art methods on two datasets. Lower FID/LPIPS and higher PSNR/SSIM indicate better performance. The best result is shown in bold, and the second-best result is underlined; significance testing is based on a paired t-test. Results are averaged over five independent samplings per background.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">Dataset</th>
<th align="center">FID&#x2193; (mean &#xb1; SD)</th>
<th align="center">LPIPS&#x2193; (mean &#xb1; SD)</th>
<th align="center">PSNR&#x2191; (mean &#xb1; SD)</th>
<th align="center">SSIM&#x2191; (mean &#xb1; SD)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="2" align="center">DDM</td>
<td align="center">DGPH-KUB</td>
<td align="center">178.31 &#xb1; 8.24</td>
<td align="center">
<bold>0.46</bold> &#xb1; <bold>0.03</bold>
</td>
<td align="center">
<bold>63.10</bold> &#xb1; <bold>1.27</bold>
</td>
<td align="center">
<bold>0.69</bold> &#xb1; <bold>0.03</bold>
</td>
</tr>
<tr>
<td align="center">LIDC-IDRI</td>
<td align="center">171.53 &#xb1; 7.91</td>
<td align="center">
<bold>0.46</bold> &#xb1; <bold>0.02</bold>
</td>
<td align="center">63.57 &#xb1; 1.19</td>
<td align="center">0.55 &#xb1; 0.02</td>
</tr>
<tr>
<td rowspan="2" align="center">StyleSwin</td>
<td align="center">DGPH-KUB</td>
<td align="center">243.12 &#xb1; 10.56</td>
<td align="center">0.49 &#xb1; 0.04</td>
<td align="center">62.63 &#xb1; 1.32</td>
<td align="center">0.64 &#xb1; 0.04</td>
</tr>
<tr>
<td align="center">LIDC-IDRI</td>
<td align="center">205.92 &#xb1; 9.83</td>
<td align="center">0.47 &#xb1; 0.03</td>
<td align="center">64.36 &#xb1; 1.08</td>
<td align="center">0.60 &#xb1; 0.06</td>
</tr>
<tr>
<td rowspan="2" align="center">VQGAN</td>
<td align="center">DGPH-KUB</td>
<td align="center">256.11 &#xb1; 12.37</td>
<td align="center">0.51 &#xb1; 0.05</td>
<td align="center">56.54 &#xb1; 1.45</td>
<td align="center">0.63 &#xb1; 0.02</td>
</tr>
<tr>
<td align="center">LIDC-IDRI</td>
<td align="center">280.23 &#xb1; 11.72</td>
<td align="center">0.57 &#xb1; 0.06</td>
<td align="center">61.33 &#xb1; 1.21</td>
<td align="center">0.52 &#xb1; 0.03</td>
</tr>
<tr>
<td rowspan="2" align="center">SinGAN</td>
<td align="center">DGPH-KUB</td>
<td align="center">277.11 &#xb1; 13.15</td>
<td align="center">0.48 &#xb1; 0.04</td>
<td align="center">58.58 &#xb1; 1.52</td>
<td align="center">0.65 &#xb1; 0.05</td>
</tr>
<tr>
<td align="center">LIDC-IDRI</td>
<td align="center">268.07 &#xb1; 12.89</td>
<td align="center">0.47 &#xb1; 0.03</td>
<td align="center">64.00 &#xb1; 1.15</td>
<td align="center">0.59 &#xb1; 0.06</td>
</tr>
<tr>
<td rowspan="2" align="center">Ours</td>
<td align="center">DGPH-KUB</td>
<td align="center">
<bold>145.64</bold> &#xb1; <bold>5.23&#x2605;&#x2605;&#x2605;</bold>
</td>
<td align="center">
<underline>0.48</underline> &#xb1; <underline>0.02</underline>
<bold>&#x2605;&#x2605;</bold>
</td>
<td align="center">59.03 &#xb1; 0.95<bold>&#x2605;&#x2605;</bold>
</td>
<td align="center">
<underline>0.67</underline> &#xb1; <underline>0.03</underline>
<bold>&#x2605;</bold>
</td>
</tr>
<tr>
<td align="center">LIDC-IDRI</td>
<td align="center">
<bold>180.29</bold> &#xb1; <bold>6.87&#x2605;&#x2605;&#x2605;</bold>
</td>
<td align="center">
<underline>0.47</underline> &#xb1; <underline>0.02</underline>
<bold>&#x2605;&#x2605;</bold>
</td>
<td align="center">
<bold>64.46</bold> &#xb1; <bold>0.84&#x2605;</bold>
</td>
<td align="center">
<underline>0.59</underline> &#xb1; <underline>0.03</underline>
<bold>&#x2605;</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="Tfn1">
<p>Significance test: &#x201c;<bold>&#x2605;</bold>&#x201d; represents <italic>p</italic> &#x3c; 0.05; &#x201c;<bold>&#x2605;&#x2605;</bold>&#x201d; represents <italic>p</italic> &#x3c; 0.01; &#x201c;<bold>&#x2605;&#x2605;&#x2605;</bold>&#x201d; represents <italic>p</italic> &#x3c; 0.001 (vs. baselines).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Generation performance of our method compared with the state-of-the-art on DGPH-KUB X-ray images.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g003.tif">
<alt-text content-type="machine-generated">A grid of twelve grayscale X-ray images shows six different methods for generating medical images. Each row represents an example image, and each column corresponds to a different method: Real, DDM, Styleswin, VQGAN-base, Singan, and Ours. Each column demonstrates varying quality and clarity in the X-ray images, highlighting differences in detail and structure of bones and organs.</alt-text>
</graphic>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Details of the performance of our method compared with the state-of-the-art on DGPH-KUB X-ray images.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g004.tif">
<alt-text content-type="machine-generated">Comparison of images across different methods: columns represent Real, DDM, Styleswin, VQGAN-base, Singan, and Ours. Rows feature spine, bubble, edge, and lesion quality. Each method shows varying levels of detail and clarity, with &#x22;Ours&#x22; displaying more defined features compared to others.</alt-text>
</graphic>
</fig>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Generation performance of our method compared with the state-of-the-art on LIDC-IDRI data.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g005.tif">
<alt-text content-type="machine-generated">Comparison of CT scan images across six columns labeled Real, DDM, Styleswin, VQGAN-base, Singan, and Ours. Each column shows three slices of a lung CT scan, illustrating differences in image quality and style produced by various generative models.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Qualitative comparison with state-of-the-art approaches</title>
<p>
<xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref> present KUB X-ray results. Our method synthesizes target images with accurate anatomical structures and fine lesion details. The details of the synthetic KUB X-ray images are displayed in <xref ref-type="fig" rid="F4">Figure 4</xref>. DDM produces structurally reasonable yet overall blurry images and often fails to generate lesion signals. The StyleSwin model produces inferior quality results, and the structure of the spinal cord is unreasonable and unclear. In this comparative experiment, the target map generated by the VQGAN model demonstrates better overall quality but lacks sharp bone edges and clear lesion depiction. The generation effect of SinGAN is not satisfactory, and additionally, the spine is broken, indicating that the model fails to learn global anatomical logic. Overall, our results are visually closest to real images, providing clearer cortical bone boundaries, a more realistic lesion appearance, and more natural representation of intrabody bubbles. Comparison of generation details (<xref ref-type="fig" rid="F4">Figure 4</xref>) shows that our method most closely resembles real images in the synthetic quality of the spine and intrabody bubbles, whereas the results of other methods deviate substantially from realism. A crucial point is that the texture, edge, and clarity of kidney stone lesions generated by the proposed method are superior.</p>
<p>
<xref ref-type="fig" rid="F5">Figures 5</xref>, <xref ref-type="fig" rid="F6">6</xref> show the visual comparison of the generated CT medical images and their details using our proposed method and other state-of-the-art techniques. Similarly, the generation effect of DDM is still vague, and the information on pulmonary blood vessels is not generated. The generation result of StyleSwin is only at a normal level for pulmonary blood vessels but severely distorted for other tissue structures. The texture features generated by the VQGAN model are better than those mentioned above, but the pulmonary vascular information is almost missing. SinGAN generates higher-quality bone and blood vessel information, but it introduces severe distortions in the morphology of other tissues. In our method, both the overall morphology and local texture features, including the vascular features of both lungs and the information about the spine, are very close to those of the real image. Representative synthetic detail information is displayed in <xref ref-type="fig" rid="F6">Figure 6</xref>. The spine generation quality of all the comparison methods is poor and does not reach the level of clinical application.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Details of the performance of our method compared with the state-of-the-art on LIDC-IDRI data.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g006.tif">
<alt-text content-type="machine-generated">CT images in five columns labeled Real, DDM, Styleswin, VQGAN-base, Singan, and Ours. Rows show the axial plane lung CT, spine and thoracic vertebrae (red ROI), nodules (green ROI), and vein and artery (blue ROI). Each section highlights different regions of interest, illustrating variations in detail generated by different methods.</alt-text>
</graphic>
</fig>
<p>Comparatively, the generation quality of this method and StyleSwin is acceptable and can roughly show the shape of the spinal bone cross-section. When comparing the generation quality of pulmonary nodules, the results of the proposed method and the VQGAN method are closest to real images, whereas other methods produce ground glass-like nodules that lack clarity and suggest a risk of malignant transformation. The last row shows the comparison of the imaging quality of arteries. It is easily observable that the proposed method can clearly generate contours and textures similar to those of real images, while the other methods cannot even present the contours of arteries.</p>
<p>The above results fully confirmed the feasibility of the proposed method in generating X-ray and CT images with high resolution. In particular, compared with the baseline VQGAN effect, the overall quality and details are significantly improved, and the effectiveness of the proposed structural optimization of HDFB is confirmed.</p>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Clinical validation using visual Turing test</title>
<p>To evaluate the perceptual fidelity of synthetic images, we conducted a visual Turing test on 200 KUB X-ray images, where 100 X-rays are real and taken from the DGPH-KUB dataset and the other 100 are images synthesized using our method. Two urologists (senior: 20 years of experience; intermediate: 10 years of experience) took part in this test to complete the authenticity judgment of the 200 KUB X-ray images. As shown in <xref ref-type="table" rid="T3">Table 3</xref>, the true positive count of the senior urologist is 85, reflecting familiarity with authentic anatomical features. Nevertheless, the true negative count is 45, which means that 55% of the synthetic images were misclassified as real. This demonstrates that our method can closely mimic clinical data. The false positives of the intermediate urologist are 55, indicating that 55% of the synthetic images were mistaken as real, which validates our method&#x2019;s perceptual fidelity. A further analysis of the clinical implications is shown in <xref ref-type="table" rid="T4">Table 4</xref>. Sensitivity (the true positive rate for real images) and specificity (the true negative rate for synthetic images) were calculated, and statistical significance was assessed using McNemar&#x2019;s [<xref ref-type="bibr" rid="B45">45</xref>] test against random guessing (50%). Inter-rater agreement was quantified using Cohen&#x2019;s kappa (&#x3ba;) [<xref ref-type="bibr" rid="B46">46</xref>]. Senior physicians demonstrated significantly higher sensitivity (85.0% vs. 65.0%, <italic>p</italic> &#x3c; 0.001), reflecting their familiarity with the characteristics of real KUB X-ray images. However, both groups exhibited critically low specificity (senior: 45.0%; intermediate: 45.0%, <italic>p</italic> &#x3c; 0.01 vs. 50% random guessing), with 55% of the synthetic images being misclassified as real. 
The low kappa values (0.3 for senior, 0.1 for intermediate) suggest variability in individual judgment criteria, yet the consistent 55% misclassification rate is sufficient to support the validity of the model&#x2019;s ability to generate clinically plausible images.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Confusion matrices of the senior and intermediate urologists in the visual Turing test. The urologists completed the authenticity judgment on 200 KUB X-ray images, which included both real and synthetic images. &#x201c;Synt&#x201d; denotes synthesized images, and P and N indicate positive and negative classes.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Actual 2 urologists prediction</th>
<th colspan="2" align="center">Senior professional</th>
<th colspan="2" align="center">Intermediate professional</th>
</tr>
<tr>
<th align="center">Real(P)</th>
<th align="center">Synt(N)</th>
<th align="center">Real(P)</th>
<th align="center">Synt(N)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Truth Real(P)</td>
<td align="center">85</td>
<td align="center">15</td>
<td align="center">65</td>
<td align="center">35</td>
</tr>
<tr>
<td align="center">Truth Synt(N)</td>
<td align="center">55</td>
<td align="center">45</td>
<td align="center">55</td>
<td align="center">45</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Results of the visual Turing test (n &#x3d; 200). Analysis of the sensitivity, specificity, accuracy, and consistency of the senior and intermediate urologists in the authenticity judgment of synthetic KUB X-ray images.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Metric</th>
<th align="center">Senior professional</th>
<th align="center">Intermediate professional</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Sensitivity (TPR, %)</td>
<td align="center">85.0</td>
<td align="center">65.0</td>
</tr>
<tr>
<td align="center">Specificity (TNR, %)</td>
<td align="center">45.0</td>
<td align="center">45.0</td>
</tr>
<tr>
<td align="center">Accuracy (%)</td>
<td align="center">65.0</td>
<td align="center">55.0</td>
</tr>
<tr>
<td align="center">Kappa (agreement)</td>
<td align="center">0.3</td>
<td align="center">0.1</td>
</tr>
<tr>
<td align="center">
<italic>p</italic>-value (vs. 50%)</td>
<td align="center">&#x3c;0.001</td>
<td align="center">0.046</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Ablation studies</title>
<sec id="s3-5-1">
<label>3.5.1</label>
<title>Effect of lesion&#x2013;background variability on synthesis stability</title>
<p>To assess the effect of lesion-sampling variability, we analyzed two cases: (1) varying lesion texture, scale, and placement on a fixed background and (2) placing a fixed lesion across varying backgrounds. All metrics were computed per the synthesized image, and the reported values are the mean &#xb1; SD across B &#x3d; 10 backgrounds &#xd7; L &#x3d; 5 lesion samples per background (N &#x3d; 50). As shown in <xref ref-type="table" rid="T5">Table 5</xref>, both scenarios exhibited low metric fluctuations (FID: 145.64&#x2013;147.21; LPIPS: 0.48&#x2013;0.49; PSNR: 58.76&#x2013;59.03; SSIM: 0.68&#x2013;0.69), indicating that the learned prior introduces controlled diversity without compromising visual realism. These variations are below clinical perceptibility thresholds, confirming the method&#x2019;s stability. Some visual examples are shown in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Variability analysis under fixed lesion/varying background and fixed background/varying lesion conditions. The metrics (mean &#xb1; SD) show minimal fluctuations, confirming robustness against sampling stochasticity.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Metric</th>
<th align="center">Fixed lesion&#x2013;varying background (mean &#xb1; SD)</th>
<th align="center">Fixed background&#x2013;varying lesion (mean &#xb1; SD)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">FID</td>
<td align="center">147.21 &#xb1; 4.56</td>
<td align="center">145.64 &#xb1; 3.82</td>
</tr>
<tr>
<td align="center">LPIPS</td>
<td align="center">0.49 &#xb1; 0.03</td>
<td align="center">0.48 &#xb1; 0.02</td>
</tr>
<tr>
<td align="center">PSNR</td>
<td align="center">58.76 &#xb1; 1.12</td>
<td align="center">59.03 &#xb1; 0.87</td>
</tr>
<tr>
<td align="center">SSIM</td>
<td align="center">0.69 &#xb1; 0.03</td>
<td align="center">0.68 &#xb1; 0.03</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Visual examples of generated images under a fixed lesion with varying backgrounds (left), where each row represents the same lesion image, and a fixed background with varying lesions (right), where each row represents the same background image (Bg is short for background, and the red boxes indicate the lesion images).</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g007.tif">
<alt-text content-type="machine-generated">X-ray images are showing variations in lesion presence and background conditions. The left panel illustrates lesions A, B, and C across different backgrounds (Bg A, B, and C). The right panel maintains a fixed background while altering lesions. Red squares highlight specific lesion areas in each image.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-5-2">
<label>3.5.2</label>
<title>Background fidelity assessment via lesion masking</title>
<p>To evaluate the impact of controlled lesion synthesis on background anatomy fidelity, we conducted a specialized analysis on 100 background samples from the DGPH-KUB test set. The experiments computed two variants: SSIM computed only on pixels outside the lesion mask (background), and SSIM with no mask computed the global pixels. As shown in <xref ref-type="table" rid="T6">Table 6</xref>, the difference between the mask SSIM 0.66 and the global SSIM 0.67 of our proposed method was only 0.01, which is comparable to the difference observed in the baseline VQGAN, demonstrating that controlled lesion insertion did not disrupt the background anatomy. Furthermore, the global SSIM of our proposed method was significantly higher than that of the VQGAN, validating the enhanced background fidelity achieved by the HDFB module. This conclusion demonstrates that the innovative approach in this paper achieves flexible integration of pathological features while maintaining the integrity of the background.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>SSIM comparison for computing outside the lesion mask.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">Dataset</th>
<th align="center">SSIM (no mask)</th>
<th align="center">SSIM(mask lesion)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">VQGAN</td>
<td align="center">DGPH-KUB</td>
<td align="center">0.63</td>
<td align="center">0.62</td>
</tr>
<tr>
<td align="center">OUR</td>
<td align="center">DGPH-KUB</td>
<td align="center">0.67</td>
<td align="center">0.66</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-5-3">
<label>3.5.3</label>
<title>Accelerated convergence via HDFB integration</title>
<p>For the ablation experiment, we compared the quantitative results with the benchmark model VQGAN in <xref ref-type="sec" rid="s3-2">Section 3.2</xref> and the visual results in <xref ref-type="sec" rid="s3-3">Section 3.3</xref>, all of which prove the effectiveness of the proposed HDFB module and have a significant effect on the performance improvement of the VQGAN model. In addition, we compared the training loss convergence of our proposed method with that of VQGAN. As shown in <xref ref-type="fig" rid="F8">Figure 8</xref>, integrating the HDFB module into the VQGAN framework leads to faster convergence and more stable training. The training loss of our method decreases more rapidly and reaches a lower value than VQGAN. In addition, the loss of the model with the HDFB block was reduced to 0.1 at approximately the 200th epoch, while the baseline model did not reach 0.1 until approximately the 1,200th epoch. A total of 10,000 epochs were run in this experiment. Compared with the baseline model, the final loss value of the model with the HDFB block was 0.023, which was 0.057 lower than the 0.08 achieved by the VQGAN model. This indicates that the HDFB module helps mitigate the vanishing gradient problem and accelerates the training process. This results in reduced training time and improved computational efficiency.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Training loss convergence comparison on the KUB images dataset.</p>
</caption>
<graphic xlink:href="fphy-13-1661146-g008.tif">
<alt-text content-type="machine-generated">Line graph titled &#x22;Training Loss Convergence Comparison&#x22; showing training loss over epochs. The blue line represents VQGAN + HDFB, while the orange line represents VQGAN. Both lines decrease rapidly, with the blue line achieving lower loss values.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>In this study, we proposed a controllable lesion synthesis framework that integrates a SinGAN-based lesion generator with anatomically guided placement and a high-fidelity background synthesis network (HiResMed-VQGAN). The experimental results demonstrate the superiority of the proposed method in generating high-resolution medical images with small lesions. The two-route synthesis strategy addresses a critical bottleneck in medical AI: the scarcity of rare-lesion data. By decoupling SinGAN&#x2019;s lesion generation from background synthesis through HiResMed-VQGAN, our framework achieves flexible lesion control while ensuring high-quality anatomical structures in the generated images and establishes a new benchmark for high-resolution medical image generation, with transformative potential in surgical planning and early-disease detection.</p>
<p>Although the proposed method demonstrates promising results in high-resolution medical image generation, some limitations need to be addressed. In this study, the inter-rater agreement was low (kappa &#x3d; 0.3 for senior, kappa &#x3d; 0.1 for intermediate), corresponding to fair and slight agreement, respectively, by Landis and Koch&#x2019;s criteria. This may be due to the intrinsic difficulty of the &#x201c;real vs. synthetic&#x201d; visual judgment task, especially in the absence of standardized evaluation criteria. The results suggest that while sensitivity was relatively high, low specificity and low agreement limit the reliability of purely visual assessments, warranting methodological refinements in future work. Although the evaluation metrics indicate superior perceptual quality, the absence of task-specific evaluation, such as lesion detection or segmentation, limits claims regarding diagnostic fidelity. This is partly mitigated by clinical validation, which shows a high misclassification rate of 55%, indicating that the synthetic lesions are anatomically plausible, and we report SSIM values computed specifically within the lesion mask. Our method achieves SSIM &#x3d; 0.68 &#xb1; 0.03 for lesions, which is significantly higher than that of VQGAN. This objectively confirms that synthetic lesions retain structural similarity to real lesions. Furthermore, low LPIPS variance (SD &#x2264; 0.02) implies perceptual consistency below human-discernible thresholds.</p>
<p>To bridge this gap, the future work will train lesion detectors and segmenters on hybrid datasets to quantify diagnostic utility. Lesion morphological control will be extended by enhancing SinGAN to generate diverse lesion shapes and textures, enabling the synthesis of atypical pathologies. The findings will be validated across modalities, and generalizability will be tested to MRI/PET, where structural constraints differ.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The LIDC-IDRI dataset presented in this article is available at <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/jokerak/lidcidri">https://www.kaggle.com/datasets/jokerak/lidcidri</ext-link>. The DGPH-KUB datasets are not readily available because the data are part of an ongoing study. Requests to access the datasets should be directed to guangfa_tang@163.com.</p>
</sec>
<sec sec-type="ethics-statement" id="s6">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Ethics Committee of Dongguan People&#x2019;s Hospital (number: KYKT2022-040). The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>GT: Writing &#x2013; review and editing, Formal analysis, Writing &#x2013; original draft, Methodology, Conceptualization. SC: Writing &#x2013; review and editing, Methodology, Formal analysis, Resources. XM: Formal analysis, Data curation, Validation, Supervision, Writing &#x2013; review and editing, Funding acquisition. SH: Writing &#x2013; review and editing, Formal analysis, Resources, Validation. MW: Visualization, Writing &#x2013; review and editing, Software. ZL: Investigation, Funding acquisition, Writing &#x2013; review and editing. ZC: Formal analysis, Data curation, Writing &#x2013; review and editing, Validation. XL: Formal analysis, Funding acquisition, Writing &#x2013; original draft, Methodology, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/268469/overview">Lipeng Ning</ext-link>, Brigham and Women&#x2019;s Hospital and Harvard Medical School, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1267073/overview">Ziqi Yu</ext-link>, Fudan University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3151249/overview">Jun Lyu</ext-link>, Yantai University, China</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wakabayashi</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Cacciaguerra</surname>
<given-names>AB</given-names>
</name>
<name>
<surname>Abe</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Dalla Bona</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Nicolini</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Mocchegiani</surname>
<given-names>F</given-names>
</name>
<etal/>
</person-group> <article-title>Indocyanine green fluorescence navigation in liver surgery: a systematic review on dose and timing of administration</article-title>. <source>Ann Surg</source> (<year>2022</year>) <volume>275</volume>(<issue>6</issue>):<fpage>1025</fpage>&#x2013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1097/SLA.0000000000005406</pub-id>
<pub-id pub-id-type="pmid">35121701</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Giri</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Puri</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Advancements in navigational bronchoscopy for peripheral pulmonary lesions: a review with special focus on virtual bronchoscopic navigation</article-title>. <source>Front Med</source> (<year>2022</year>) <volume>9</volume>:<fpage>989184</fpage>. <pub-id pub-id-type="doi">10.3389/fmed.2022.989184</pub-id>
<pub-id pub-id-type="pmid">36300190</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Currie</surname>
<given-names>GM</given-names>
</name>
<name>
<surname>Hawk</surname>
<given-names>KE</given-names>
</name>
<name>
<surname>Rohren</surname>
<given-names>EM</given-names>
</name>
</person-group>. <article-title>Generative artificial intelligence biases, limitations and risks in nuclear medicine: an argument for appropriate use framework and recommendations</article-title>. <source>Semin Nucl Med</source> <volume>55</volume> (<year>2024</year>). p. <fpage>423</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1053/j.semnuclmed.2024.05.005</pub-id>
<pub-id pub-id-type="pmid">38851934</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>H&#xf6;lscher</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Reich</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Gut</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Knahl</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Clarke</surname>
<given-names>N</given-names>
</name>
</person-group>. <article-title>Exploring the efficacy and limitations of histogram-based fake image detection</article-title>. <source>Proced Computer Sci</source> (<year>2024</year>) <volume>246</volume>:<fpage>2882</fpage>&#x2013;<lpage>91</lpage>. <pub-id pub-id-type="doi">10.1016/j.procs.2023.102288</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Soni</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Chauhan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kaur</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Navigating the realm of generative models: GANs, diffusion, limitations, and future prospects&#x2014;A review</article-title>. In: <source>International conference on cognitive computing and cyber physical systems</source>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer Nature Singapore</publisher-name> (<year>2023</year>).</mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Dwivedi</surname>
<given-names>DN</given-names>
</name>
<name>
<surname>Dwivedi</surname>
<given-names>VN</given-names>
</name>
</person-group>. <article-title>Critiquing the limitations&#x2019; challenges in detecting GAN-generated images with computer vision</article-title>. In: <source>International conference on communication and intelligent systems</source>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer Nature Singapore</publisher-name> (<year>2023</year>). <pub-id pub-id-type="doi">10.1007/978-981-97-2053-8_7</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Uzunova</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ehrhardt</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jacob</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Frydrychowicz</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Handels</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Multi-scale gans for memory-efficient generation of high resolution medical images</article-title>. In: <source>The 22nd international conference on medical image computing and computer assisted intervention (MICCAI)</source>. <publisher-loc>Shenzhen, China</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (<year>2019</year>). <pub-id pub-id-type="doi">10.1007/978-3-030-32226-7_6</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khatun</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Yeter-Aydeniz</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Weinstein</surname>
<given-names>YS</given-names>
</name>
<name>
<surname>Usman</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Quantum generative learning for high-resolution medical image generation</article-title>. <source>Machine Learn Sci Technology</source> (<year>2025</year>) <volume>6</volume>(<issue>2</issue>):<fpage>025032</fpage>. <pub-id pub-id-type="doi">10.1088/2632-2153/add1a9</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Metaxas</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Improved transformer for high-resolution gans</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2021</year>) <volume>34</volume>:<fpage>18367</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2104.11233</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>D</given-names>
</name>
<etal/>
</person-group> <article-title>Efficient-vqgan: towards high-resolution image generation with efficient vision transformers</article-title>. In: <source>Proceedings of the IEEE/CVF international conference on computer vision</source>. <publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2023</year>). p. <fpage>7368</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV42023.2023.00249</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>GH</given-names>
</name>
</person-group>. <article-title>X-ray: a sequential 3d representation for generation</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2024</year>) <volume>37</volume>:<fpage>136193</fpage>&#x2013;<lpage>219</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2401.13619</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Cong</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Carin</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>On leveraging pretrained gans for generation with limited data</article-title>. In: <conf-name>Proceedings of the 37th International Conference on Machine Learning</conf-name>, Vienna, Austria. <publisher-name>JMLR.org</publisher-name> (<year>2020</year>):<fpage>11340</fpage>&#x2013;<lpage>11351</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2002.07781</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goodfellow</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Pouget-Abadie</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Mirza</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Warde-Farley</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Ozair</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Generative adversarial networks</article-title>. <source>Commun ACM</source> (<year>2020</year>) <volume>63</volume>(<issue>11</issue>):<fpage>139</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1145/3422622</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>GAN review: models and medical image fusion applications</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>91</volume>:<fpage>134</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2022.10.017</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>JH</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>MH</given-names>
</name>
</person-group>. <article-title>Gan inversion: a survey</article-title>. <source>IEEE Trans pattern Anal machine intelligence</source> (<year>2022</year>) <volume>45</volume>(<issue>3</issue>):<fpage>3121</fpage>&#x2013;<lpage>38</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2022.3181070</pub-id>
<pub-id pub-id-type="pmid">37022469</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dash</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>A review of generative adversarial networks (GANs) and its applications in a wide variety of disciplines: from medical to remote sensing</article-title>. <source>IEEE Access</source> (<year>2023</year>) <volume>12</volume>:<fpage>18330</fpage>&#x2013;<lpage>57</lpage>. <pub-id pub-id-type="doi">10.1109/access.2023.3346273</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>AN</given-names>
</name>
<name>
<surname>Stouffs</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Biljecki</surname>
<given-names>F</given-names>
</name>
</person-group>. <article-title>Generative adversarial networks in the built environment: a comprehensive review of the application of GANs across data types and scales</article-title>. <source>Building Environ</source> (<year>2022</year>) <volume>223</volume>:<fpage>109477</fpage>. <pub-id pub-id-type="doi">10.1016/j.buildenv.2022.109477</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Rombach</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Blattmann</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Lorenz</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Esser</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Ommer</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>High-resolution image synthesis with latent diffusion models</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<year>2022</year>). <pub-id pub-id-type="doi">10.1109/CVPR46437.2022.00574</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Polson</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Speier</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Arnold</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>High resolution histopathology image generation and segmentation through adversarial training</article-title>. <source>Med Image Anal</source> (<year>2022</year>) <volume>75</volume>:<fpage>102251</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2021.102251</pub-id>
<pub-id pub-id-type="pmid">34814059</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ho</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>Denoising diffusion probabilistic models</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2020</year>) <volume>33</volume>:<fpage>6840</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2006.11239</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Bao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Styleswin: transformer-Based gan for high-resolution image generation</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<year>2022</year>). <pub-id pub-id-type="doi">10.1109/CVPR.2022.01614</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aboutalebi</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Pavlova</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Gunraj</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Shafiee</surname>
<given-names>MJ</given-names>
</name>
<name>
<surname>Sabri</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Alaref</surname>
<given-names>A</given-names>
</name>
<etal/>
</person-group> <article-title>MEDUSA: multi-scale encoder-decoder self-attention deep neural network architecture for medical image analysis</article-title>. <source>Front Med</source> (<year>2022</year>) <volume>8</volume>:<fpage>821120</fpage>. <pub-id pub-id-type="doi">10.3389/fmed.2021.821120</pub-id>
<pub-id pub-id-type="pmid">35242769</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Martini</surname>
<given-names>MG</given-names>
</name>
</person-group>. <article-title>Measuring objective image and video quality: on the relationship between SSIM and PSNR for DCT-based compressed images</article-title>. <source>IEEE Trans Instrumentation Meas</source> (<year>2025</year>) <volume>74</volume>:<fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2025.3529045</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Sang</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Med&#x2010;SRNet: GAN&#x2010;Based Medical Image Super&#x2010;Resolution via High&#x2010;Resolution Representation Learning</article-title>. <source>Comput. Intell. Neurosci.</source> (<year>2022</year>). <fpage>1744969</fpage>. <pub-id pub-id-type="doi">10.1155/2022/1744969</pub-id>
<pub-id pub-id-type="pmid">35747717</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ni</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>High-resolution medical image reconstruction based on residual neural network for diagnosis of cerebral aneurysm</article-title>. <source>Front Cardiovasc Med</source> (<year>2022</year>) <volume>9</volume>:<fpage>1013031</fpage>. <pub-id pub-id-type="doi">10.3389/fcvm.2022.1013031</pub-id>
<pub-id pub-id-type="pmid">36337881</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>BS</given-names>
</name>
</person-group>. <article-title>3D-DGGAN: a data-guided generative adversarial network for high fidelity in medical image generation</article-title>. <source>IEEE J Biomed Health Inform</source> (<year>2024</year>) <volume>28</volume>(<issue>5</issue>):<fpage>2904</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2024.3367375</pub-id>
<pub-id pub-id-type="pmid">38416610</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kang</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Chikontwe</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Won</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Luna</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>SH</given-names>
</name>
</person-group>. <article-title>Structure-preserving image translation for multi-source medical image domain adaptation</article-title>. <source>Pattern Recognition</source> (<year>2023</year>) <volume>144</volume>:<fpage>109840</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2023.109840</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>HiFi-Syn: hierarchical granularity discrimination for high-fidelity synthesis of MR images with structure preservation</article-title>. <source>Med Image Anal</source> (<year>2025</year>) <volume>100</volume>:<fpage>103390</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2024.103390</pub-id>
<pub-id pub-id-type="pmid">39602984</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Cross-grained contrastive representation for unsupervised lesion segmentation in medical images</article-title>. In: <source>Proceedings of the IEEE/CVF international conference on computer vision 2023</source>. <publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2023</year>). p. <fpage>2347</fpage>&#x2013;<lpage>54</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<label>30.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Esser</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Rombach</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Ommer</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>Taming transformers for high-resolution image synthesis</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>. <publisher-loc>Nashville, TN, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2021</year>). <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00376</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<label>31.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Towards accurate image coding: improved autoregressive image generation with dynamic vector quantization</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2023</year>). p. <fpage>12873</fpage>&#x2013;<lpage>83</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2023.01893</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<label>32.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Shan</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Shou</surname>
<given-names>MZ</given-names>
</name>
</person-group>. <article-title>Rethinking the objectives of vector-quantized tokenizers for image synthesis</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>. <publisher-loc>Seattle, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2024</year>). p. <fpage>7631</fpage>&#x2013;<lpage>40</lpage>.</mixed-citation>
</ref>
<ref id="B33">
<label>33.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Verma</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Mohan</surname>
<given-names>V</given-names>
</name>
</person-group>. <article-title>Vector quantization loss analysis in VQGANs: a single-GPU ablation study for image-to-image synthesis</article-title>. <source>arXiv preprint arXiv:2308.05242</source> (<year>2023</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2308.05242</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<label>34.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Scaling the codebook size of vqgan to 100,000 with a utilization rate of 99%</article-title>. <source>arXiv preprint arXiv:2406.11837</source> (<year>2024</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2406.11837</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<label>35.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhan</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Multimodal image synthesis and editing: a survey</article-title>. <source>arXiv preprint arXiv:2112</source> (<year>2022</year>):<fpage>13592</fpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2112.13592</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<label>36.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Theobalt</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Regularized vector quantization for tokenized image synthesis</article-title>. In: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2023</year>). p. <fpage>18467</fpage>&#x2013;<lpage>76</lpage>.</mixed-citation>
</ref>
<ref id="B37">
<label>37.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Podell</surname>
<given-names>D</given-names>
</name>
<name>
<surname>English</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Lacey</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Blattmann</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Dockhorn</surname>
<given-names>T</given-names>
</name>
</person-group>. <article-title>SDXL: improving latent diffusion models for high-resolution image synthesis</article-title>. <source>arXiv preprint arXiv:2307.01952</source> (<year>2023</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2307.01952</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<label>38.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Enhancing VQGAN performance through integration of multiple vision transformers</article-title>. In: <source>2024 IEEE 8th international conference on vision, image and signal processing (ICVISP)</source>. <publisher-loc>Kuala Lumpur, Malaysia</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2024</year>). p. <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/ICVISP64524.2024.10959589</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<label>39.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Shaham</surname>
<given-names>TR</given-names>
</name>
<name>
<surname>Dekel</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Michaeli</surname>
<given-names>T</given-names>
</name>
</person-group>. <article-title>Singan: learning a generative model from a single natural image</article-title>. In: <source>Proceedings of the IEEE/CVF international conference on computer vision</source>. <publisher-loc>Seoul, Korea (South)</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2019</year>). p. <fpage>4570</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2019.00123</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<label>40.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pappas</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Kovaios</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Moralis-Pegios</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Tsakyridis</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Giamougiannis</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Kirtas</surname>
<given-names>M</given-names>
</name>
<etal/>
</person-group> <article-title>Programmable tanh-, elu-, sigmoid-, and sin-based nonlinear activation functions for neuromorphic photonics</article-title>. <source>IEEE J Selected Top Quan Electronics</source> (<year>2023</year>) <volume>29</volume>(<issue>6</issue>):<fpage>1</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1109/jstqe.2023.3277118</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<label>41.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gunawan</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>Understanding and improving group normalization</article-title>. <source>arXiv preprint arXiv:2207.01972</source> (<year>2022</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2207.01972</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<label>42.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hendrycks</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Gimpel</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>Gaussian error linear units (gelus)</article-title>. <source>arXiv</source> (<year>2016</year>). <comment>arXiv:1606.08415</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.1606.08415</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<label>43.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Heusel</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Ramsauer</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Unterthiner</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Nessler</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Hochreiter</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Gans trained by a two time-scale update rule converge to a local nash equilibrium</article-title>. In: <source>The thirty-first annual conference on neural information processing systems (NIPS)</source>. <publisher-loc>Long Beach, California</publisher-loc>: <publisher-name>Curran Associates Inc</publisher-name> (<year>2017</year>). p. <fpage>6629</fpage>&#x2013;<lpage>40</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<label>44.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ghildyal</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>F</given-names>
</name>
</person-group>. <article-title>Shift-tolerant perceptual similarity metric</article-title>. In: <source>The 17th European conference on computer vision</source>. <publisher-loc>Tel Aviv, Israel</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2022</year>). p. <fpage>91</fpage>&#x2013;<lpage>107</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-19836-6_6</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<label>45.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>McNemar</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Note on the sampling error of the difference between correlated proportions or percentages</article-title>. <source>Psychometrika</source> (<year>1947</year>) <volume>12</volume>(<issue>2</issue>):<fpage>153</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1007/bf02295996</pub-id>
<pub-id pub-id-type="pmid">20254758</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<label>46.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Altman</surname>
<given-names>DG</given-names>
</name>
</person-group>. <source>Practical statistics for medical research</source>. <publisher-name>Chapman and Hall/CRC</publisher-name> (<year>1990</year>). <pub-id pub-id-type="doi">10.1201/9780429258589</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>