<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Cell. Infect. Microbiol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Cellular and Infection Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Cell. Infect. Microbiol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2235-2988</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcimb.2025.1616189</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Multi-omics approaches for image classification in disease diagnosis</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Lin</surname><given-names>Yan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chen</surname><given-names>Shu</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Che</surname><given-names>Jinshan</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3043912/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Sun</surname><given-names>Mingming</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname><given-names>Yuhong</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Critical Care Medicine, Clinical Oncology School of Fujian Medical University, Fujian Cancer Hospital, NHC Key Laboratory of Cancer Metabolism</institution>, <city>Fuzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Gastric Surgery, Clinical Oncology School of Fujian Medical University, Fujian Cancer Hospital, NHC Key Laboratory of Cancer Metabolism</institution>, <city>Fuzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>Fourth Clinical College of Xinxiang Medical College, Xinxiang Central Hospital</institution>, <city>Xinxiang</city>, <state>Henan</state>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Shu Chen, <email xlink:href="mailto:ticzondblink@hotmail.com">ticzondblink@hotmail.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-12-04">
<day>04</day>
<month>12</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>15</volume>
<elocation-id>1616189</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>04</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Lin, Chen, Che, Sun and Wang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Lin, Chen, Che, Sun and Wang</copyright-holder>
<license>
<ali:license_ref start_date="2025-12-04">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The integration of multi-omics data for disease diagnosis holds transformative potential in the field of computational biology, especially when applied to the intricate and dynamic interactions between microbial communities and their human hosts.</p>
</sec>
<sec>
<title>Methods</title>
<p>This integrative approach enables the capture of diverse biological signals across genomic, transcriptomic, proteomic, and metabolomic layers, providing a more comprehensive understanding of disease mechanisms. In alignment with emerging priorities in disease microbiology, our study addresses a critical and timely need for interpretable, scalable, and biologically robust computational models that can extract clinically meaningful diagnostic insights from inherently high-dimensional, heterogeneous, and often incomplete biological datasets.</p>
</sec>
<sec>
<title>Results and Discussion</title>
<p>Traditional image classification approaches in disease contexts&#x2014;such as those relying solely on histopathological features or genomic imaging&#x2014;tend to overlook the broader ecological and systemic dimensions that are essential for decoding the mechanisms of microbial pathogenesis. These single-modal methods often suffer from significant limitations, including reduced scalability to diverse clinical settings, poor generalizability across patient populations, and an inability to handle partially observed or biologically variable data. Such constraints diminish their effectiveness in precision diagnostics, disease subtyping, and therapeutic decision-making. By contrast, our approach emphasizes multi-modal integration and model interpretability, aiming to overcome these limitations and advance the development of next-generation diagnostic tools that are both clinically actionable and biologically grounded.</p>
</sec>
</abstract>
<kwd-group>
<kwd>multi-omics integration</kwd>
<kwd>disease microbiology</kwd>
<kwd>image classification</kwd>
<kwd>host-pathogen dynamics</kwd>
<kwd>equilibrium inference strategy</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that no financial support was received for the research and/or publication of this article.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="8"/>
<equation-count count="34"/>
<ref-count count="54"/>
<page-count count="20"/>
<word-count count="13008"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Clinical and Diagnostic Microbiology and Immunology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The integration of multi-omics data with imaging has emerged as a promising direction for disease diagnosis, driven by the need for more accurate, early, and individualized medical assessments (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2021a</xref>). Traditional imaging alone often lacks the molecular-level specificity necessary to capture early or subtle pathological changes (<xref ref-type="bibr" rid="B31">Maur&#xed;cio et&#xa0;al., 2023</xref>). Conversely, omics data&#x2014;such as genomics, transcriptomics, proteomics, and metabolomics&#x2014;offer comprehensive biological insights but lack spatial context. Not only does a multi-omics approach provide a more holistic view of disease mechanisms, but it also enhances the interpretability of image-based models through biologically relevant features (<xref ref-type="bibr" rid="B44">Touvron et&#xa0;al., 2021</xref>). Furthermore, fusing omics layers with imaging data enables more robust and generalizable classification systems, particularly critical for complex, multifactorial diseases like cancer and neurodegenerative disorders (<xref ref-type="bibr" rid="B17">Hong et&#xa0;al., 2021</xref>). This convergence supports personalized medicine, as it can tailor diagnosis and treatment strategies to individual patients based on a combination of molecular signatures and visual pathology, thus redefining the landscape of disease classification and prognosis (<xref ref-type="bibr" rid="B43">Tian et&#xa0;al., 2020</xref>).</p>
<p>In the initial phase of development, researchers attempted to link biological and imaging information by formulating expert-driven frameworks that relied on structured associations between annotated image characteristics and known disease features (<xref ref-type="bibr" rid="B16">Hong et&#xa0;al., 2020</xref>). These systems, often built on domain knowledge and manually encoded rules, offered a degree of interpretability and control over diagnostic logic, making them attractive in controlled or narrowly defined clinical contexts (<xref ref-type="bibr" rid="B48">Yang et&#xa0;al., 2021b</xref>). Typically, these frameworks functioned by aligning predefined imaging phenotypes with curated biological markers, facilitating early attempts at multi-modal integration. However, their performance deteriorated in the face of real-world complexity, where biological data are incomplete, noisy, and not easily described by fixed patterns (<xref ref-type="bibr" rid="B41">Sun et&#xa0;al., 2022</xref>). The brittle nature of rule-based models, coupled with their reliance on comprehensive annotation and expert supervision, made them poorly suited for dynamic or large-scale datasets (<xref ref-type="bibr" rid="B36">Rao et&#xa0;al., 2021</xref>). Moreover, the lack of scalability and adaptability significantly limited their applicability to broader clinical settings, especially when integrating multi-omics layers with heterogeneous structures and missing values was required (<xref ref-type="bibr" rid="B28">Mai et&#xa0;al., 2021</xref>).</p>
<p>To move beyond these limitations, a second wave of approaches employed statistical models and algorithmic pipelines that could learn from examples rather than fixed rules (<xref ref-type="bibr" rid="B45">Wang et&#xa0;al., 2022</xref>). These methods established more flexible paradigms by combining numerical features derived from omics profiles with visual traits extracted from medical images, allowing for semi-automated analysis and classification (<xref ref-type="bibr" rid="B3">Azizi et&#xa0;al., 2021</xref>). By introducing statistical learning principles into diagnostic pipelines, these approaches facilitated pattern recognition across diverse patient populations and provided a foundation for multi-modal fusion (<xref ref-type="bibr" rid="B24">Li et&#xa0;al., 2020</xref>). While this led to measurable gains in predictive power and opened the door to more robust multi-modal analysis, their effectiveness was often constrained by the need for meticulous feature engineering, reliance on domain-specific preprocessing pipelines, and susceptibility to overfitting in high-dimensional settings (<xref ref-type="bibr" rid="B5">Bhojanapalli et&#xa0;al., 2021</xref>). The rigid separation between imaging and omics modalities, combined with challenges in modeling their interactions, hindered the discovery of complex cross-modal relationships necessary for deeper biological insight and clinically meaningful interpretation (<xref ref-type="bibr" rid="B19">Kim et&#xa0;al., 2022</xref>).</p>
<p>In recent years, advances in computational frameworks have ushered in a new generation of integrative models capable of learning hierarchical, cross-modal representations directly from raw, high-dimensional data (<xref ref-type="bibr" rid="B50">Zhang et&#xa0;al., 2020</xref>). By leveraging neural architectures adept at capturing both spatial and contextual information, including convolutional networks, attention mechanisms, and graph-based embeddings, these models facilitate end-to-end learning pipelines that unify imaging and omics data at multiple abstraction levels (<xref ref-type="bibr" rid="B37">Roy et&#xa0;al., 2022</xref>). Such integration allows for the discovery of subtle, non-obvious patterns that are critical for accurate disease classification, outcome prediction, and biomarker identification (<xref ref-type="bibr" rid="B54">Zhu et&#xa0;al., 2020</xref>). Importantly, these models have shown the ability to generalize across diverse cohorts, adapt to missing or partially observed data, and highlight biologically meaningful interactions between cellular structure and molecular function (<xref ref-type="bibr" rid="B8">Chen et&#xa0;al., 2021b</xref>). Despite persistent challenges related to harmonizing data resolution, modality-specific noise, model transparency, and computational resource demands, these integrative systems have demonstrated superior performance and adaptability in diagnosing multifactorial diseases. They mark a significant step toward the realization of truly personalized, multi-modal, and data-integrated medical decision-making in the era of precision health (<xref ref-type="bibr" rid="B2">Ashtiani et&#xa0;al., 2021</xref>).</p>
<p>Based on the limitations of symbolic reasoning, the reliance on handcrafted features in machine learning, and the integration challenges in deep learning, we propose a novel framework that unifies imaging and multi-omics data through a cross-modal attention mechanism and dynamic graph fusion. This approach is designed to address both the heterogeneity and the complementarity of imaging and omics modalities. By aligning visual and molecular information within a shared representational space, our method not only facilitates deeper biological interpretation but also improves model generalizability. Moreover, the proposed system incorporates domain-specific priors to enhance interpretability while retaining the adaptability of end-to-end learning. The motivation behind our method stems from the need to create a cohesive pipeline that overcomes existing bottlenecks in feature alignment, modality fusion, and diagnostic reliability, particularly in real-world, heterogeneous patient datasets. Our framework is designed to integrate comprehensive biological understanding with advanced visual analytics, thereby advancing the field of precision diagnostics.</p>
<list list-type="bullet">
<list-item>
<p>The proposed method introduces a novel cross-modal attention module tailored for aligning multi-omics data with spatial imaging features, enabling more effective fusion of heterogeneous data.</p></list-item>
<list-item>
<p>It features a dynamic graph-based fusion mechanism that adapts to varying data structures, supporting multi-disease scenarios, enhancing computational efficiency, and offering strong generalizability.</p></list-item>
<list-item>
<p>Experimental results on benchmark disease datasets demonstrate superior classification accuracy, robustness across modalities, and improved interpretability compared to existing state-of-the-art methods.</p></list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Integration of multi-omics data</title>
<p>The integration of multi-omics data has emerged as a powerful strategy to enhance disease diagnosis by capturing the biological complexity underlying disease phenotypes (<xref ref-type="bibr" rid="B29">Masana et&#xa0;al., 2020</xref>). Multi-omics data typically include genomics, transcriptomics, proteomics, metabolomics, and epigenomics. Each omics layer offers distinct but complementary information about the molecular state of a biological system. In disease diagnosis, especially for complex diseases such as cancer and neurodegenerative disorders, single-omics approaches often fail to capture the full spectrum of molecular alterations. Integrative approaches aim to combine these heterogeneous data types to achieve more comprehensive insights into disease mechanisms and more accurate predictions (<xref ref-type="bibr" rid="B22">Lahiri et&#xa0;al., 2019</xref>). Data integration strategies can be broadly classified into early integration (concatenation-based), intermediate integration (model-based), and late integration (decision-based) methods. Early integration methods simply concatenate features from each omics layer into a single feature vector, which is then used to train machine learning models. While straightforward, this approach suffers from the curse of dimensionality and potential information redundancy (<xref ref-type="bibr" rid="B52">Zhang et&#xa0;al., 2022</xref>). Intermediate integration strategies use algorithms such as canonical correlation analysis, kernel methods, or deep learning to find shared representations between omics datasets before classification. Late integration methods involve training separate models for each omics layer and then combining their outputs through ensemble methods or voting schemes (<xref ref-type="bibr" rid="B21">Lahiri et&#xa0;al., 2006</xref>). Recent advancements in deep learning have further enabled the development of more sophisticated intermediate integration strategies. 
For instance, autoencoders and variational autoencoders have been used to learn compact, non-linear embeddings of multi-omics data, which can then be fused with image features for downstream classification. Multi-modal deep learning frameworks that simultaneously process omics and image data streams have demonstrated improved performance in disease classification tasks (<xref ref-type="bibr" rid="B30">Mascarenhas and Agarwal, 2021</xref>). In the context of image classification, multi-omics data often serve as complementary inputs to imaging features extracted from modalities such as MRI, CT, or histopathology (<xref ref-type="bibr" rid="B38">Roy et&#xa0;al., 2023a</xref>). For example, in cancer diagnosis, integrating gene expression data with histopathological images has been shown to improve the prediction of tumor subtypes and patient outcomes. The integration enhances the interpretability of the image features by linking them to molecular pathways and biological processes. The primary challenges in multi-omics integration include data heterogeneity, missing values, and the need for large, annotated datasets. Furthermore, effective feature selection and normalization techniques are critical to mitigate the effects of noise and batch effects. Addressing these challenges requires a combination of domain knowledge, computational innovation, and rigorous validation protocols (<xref ref-type="bibr" rid="B15">He et&#xa0;al., 2025</xref>).</p>
<p>In recent years, a number of computational tools have been developed to address the challenge of multi-omics data integration in disease analysis and patient stratification. Methods such as PINSPlus (<xref ref-type="bibr" rid="B33">Nguyen et&#xa0;al., 2019</xref>) and Cancer Integration via Multikernel Learning (CIMLR) (<xref ref-type="bibr" rid="B35">Ramazzotti et&#xa0;al., 2018</xref>) have demonstrated the utility of combining genomic, transcriptomic, and epigenomic profiles to discover tumor subtypes, using consensus clustering and similarity-based fusion, respectively. More advanced frameworks like iCluF (<xref ref-type="bibr" rid="B39">Shakyawar et&#xa0;al., 2024</xref>) employ iterative cluster-fusion strategies to enhance stability in unsupervised multi-omics integration, while Subtype-GAN (<xref ref-type="bibr" rid="B47">Yang et&#xa0;al., 2021a</xref>) applies generative adversarial networks to improve latent feature extraction across heterogeneous biological data sources. These tools have shown that integrating molecular layers leads to more biologically meaningful subtyping and better clinical interpretability (<xref ref-type="bibr" rid="B18">Kautish et&#xa0;al., 2021</xref>). Beyond cancer subtyping, integration models have also been extended to public health and epidemiological contexts. For example, <xref ref-type="bibr" rid="B40">Shakyawar et&#xa0;al. (2021)</xref> proposed a big data framework that jointly models multi-omics data and comorbidities in the context of COVID-19 and systemic diseases, demonstrating the feasibility of applying integrative analytics beyond oncology. These prior efforts underscore the methodological relevance of multi-modal fusion in complex disease modeling. Building upon this foundation, our study introduces a visual&#x2013;omics framework that complements these integration strategies by embedding visual pathology data into the multi-omics learning process. 
Compared to existing methods, our approach incorporates spatial, immune, and ecological dynamics, offering a unified probabilistic representation suitable for both classification and latent inference under real-world biological constraints.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Deep learning in medical imaging</title>
<p>Deep learning has revolutionized the field of medical imaging by enabling the automated extraction of hierarchical features from complex image data. Convolutional neural networks (CNNs), in particular, have demonstrated remarkable success in tasks such as image classification, segmentation, and detection across various imaging modalities, including MRI, CT, ultrasound, and digital pathology (<xref ref-type="bibr" rid="B46">Woo et&#xa0;al., 2023a</xref>). The use of deep learning in disease diagnosis has become prevalent due to its ability to learn discriminative features that may not be visible to the human eye (<xref ref-type="bibr" rid="B42">Taori et&#xa0;al., 2020</xref>). Traditional machine learning approaches relied heavily on handcrafted features, which are not only labor-intensive to design but also limited in their ability to capture high-level semantic information. Deep learning models, however, learn feature representations directly from raw image pixels in a data-driven manner. This capability has led to significant improvements in diagnostic accuracy, especially in the classification of diseases such as diabetic retinopathy, lung cancer, Alzheimer&#x2019;s disease, and breast cancer (<xref ref-type="bibr" rid="B34">Peng et&#xa0;al., 2022</xref>). In multi-omics-based image classification, deep learning frameworks can be used to fuse visual and molecular data, leading to more robust and biologically meaningful predictions. Several architectures have been proposed for this purpose, including multi-branch networks where separate CNN branches process image and omics data before combining them in a shared representation space (<xref ref-type="bibr" rid="B23">Lahiri et&#xa0;al., 2012</xref>). Attention mechanisms and graph neural networks have also been utilized to enhance the model&#x2019;s ability to focus on relevant features across different data modalities. 
Transfer learning and pretrained models play a crucial role in medical imaging, particularly when labeled data are scarce. Models trained on large datasets such as DIBaS can be fine-tuned on medical imaging tasks to achieve better generalization. Data augmentation techniques and synthetic image generation using generative adversarial networks (GANs) have also been employed to overcome data limitations (<xref ref-type="bibr" rid="B4">Bazi et&#xa0;al., 2021</xref>). Despite these advancements, challenges remain in terms of model interpretability, generalization across institutions, and regulatory approval for clinical deployment. Explainable Artificial Intelligence (AI) techniques, such as saliency maps and class activation maps, are increasingly being integrated into deep learning workflows to provide insights into model decisions and enhance clinical trust (<xref ref-type="bibr" rid="B27">Lu et&#xa0;al., 2025</xref>).</p>
<p>In recent years, attention mechanisms have emerged as a transformative component in deep learning models, particularly in the field of medical imaging. Unlike traditional convolutional operations that rely on fixed receptive fields, attention modules dynamically weigh the importance of spatial, channel-wise, or cross-modal features, enabling models to focus on the most informative regions of the input data. This has led to significant improvements in tasks such as lesion localization, fine-grained classification, and modality fusion. For instance, self-attention modules in transformer-based architectures, such as Vision Transformers (ViT), have demonstrated competitive performance in radiology and pathology tasks by capturing long-range dependencies and contextual interactions more effectively than CNNs. Cross-attention designs have been employed to link image features with complementary data modalities, such as gene expression profiles or clinical metadata, supporting more robust diagnosis under heterogeneous data conditions. Recent works have also explored the integration of attention into multi-branch architectures, where image and omics features are processed separately and then aligned via learnable attention weights. These strategies improve the interpretability and adaptability of medical AI models, especially in scenarios with partially missing or noisy modalities. In our work, attention is used as a core mechanism in both spatial feature refinement and modality-level fusion. Specifically, we adopt a cross-modal attention layer that aligns omics-derived embeddings with spatial image features, allowing the model to selectively integrate molecular signals in a region-aware manner. This design choice is supported by the growing body of literature that demonstrates the superiority of attention-based fusion in medical image understanding and multi-modal learning.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Multi-modal fusion techniques</title>
<p>Multi-modal fusion refers to the computational techniques used to integrate data from different modalities to improve the performance and reliability of disease diagnosis models. In the context of multi-omics and image data, fusion techniques are critical for leveraging the complementary nature of visual and molecular information. Effective fusion not only enhances classification performance but also improves biological interpretability and clinical relevance (<xref ref-type="bibr" rid="B6">Bugatti et&#xa0;al., 2025a</xref>). Fusion techniques are generally categorized into three levels: data, feature, and decision. Data-level fusion combines raw data from different sources before any processing, which is rare due to the differences in data structure and dimensionality (<xref ref-type="bibr" rid="B51">Zhang et&#xa0;al., 2025a</xref>). Feature-level fusion extracts features from each modality separately and then concatenates or combines them using methods such as tensor fusion, bilinear pooling, or attention-based mechanisms. Decision-level fusion aggregates the outputs of separate classifiers, using ensemble methods such as majority voting or stacking (<xref ref-type="bibr" rid="B53">Zheng et&#xa0;al., 2022</xref>). Advanced feature-level fusion methods have been developed using deep learning. These include architectures like cross-modal transformers, dual-stream CNNs, and hybrid networks that incorporate recurrent layers or attention modules. Such models are capable of capturing complex relationships between modalities and can dynamically weigh the importance of each modality during classification. Multi-modal fusion has been applied successfully in several disease diagnosis tasks (<xref ref-type="bibr" rid="B6">Bugatti et&#xa0;al., 2025a</xref>). For instance, in glioma classification, combining MRI features with gene expression profiles has led to more accurate predictions of tumor grade and molecular subtype. 
Similarly, integrating histopathology images with proteomic and transcriptomic data has enhanced the identification of prognostic biomarkers in breast cancer (<xref ref-type="bibr" rid="B10">Dai and Gao, 2021</xref>). An important aspect of multi-modal fusion is the alignment of data across modalities, both temporally and spatially. For example, aligning biopsy samples with corresponding imaging data requires careful annotation and registration techniques. Moreover, data normalization and dimensionality reduction are essential preprocessing steps to ensure that the fused features are compatible and informative (<xref ref-type="bibr" rid="B51">Zhang et&#xa0;al., 2025a</xref>). The success of multi-modal fusion depends on the availability of high-quality, well-annotated datasets, as well as on computational frameworks that can efficiently handle the scale and complexity of multi-omics and imaging data. With continued advancements in data acquisition technologies and computational methods, multi-modal fusion is poised to play a central role in the next generation of precision medicine tools (<xref ref-type="bibr" rid="B12">Dong et&#xa0;al., 2022</xref>).</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Method</title>
<sec id="s3_1">
<label>3.1</label>
<title>Overview</title>
<p>Disease microbiology represents a pivotal field that investigates the interactions between microbial agents and their hosts, focusing on the molecular, cellular, and ecological aspects of infectious diseases. This subsection aims to provide a comprehensive introduction to the methodological framework and conceptual underpinnings that form the basis of our proposed approach. We outline the structure and contributions of the ensuing sections, highlighting their roles in the broader methodology.</p>
<p>We begin by formally defining the core research problem and mathematical structure in Section 3.2. This part introduces a symbolized abstraction of the disease microbiology setting, capturing both host&#x2013;pathogen dynamics and the multi-scale interactions that influence disease progression. The section lays out the notational foundation, integrating principles from epidemiology, molecular biology, and systems microbiology. The mathematical formulations not only enable a deeper understanding of pathogen behaviors under various environmental and immunological contexts but also provide a tractable basis for model development and analytical reasoning. Following the foundational formalism, Section 3.3 introduces a biologically inspired computational architecture tailored for disease microbiology. The new model encapsulates pathogen evolution, host responses, and microenvironmental feedback using a dynamic, structured representation. In contrast to conventional mechanistic or statistical models, our design leverages advances in probabilistic graphical modeling and latent variable learning to simulate the heterogeneous trajectories of infection. It incorporates high-dimensional, multi-modal datasets and is engineered to accommodate partial observations and inherent biological variability. Moreover, it addresses challenges in modeling latent reservoirs, horizontal gene transfer, and cross-species spillover with novel algorithmic components. Subsequently, in Section 3.4, we detail a bespoke inference strategy that facilitates the efficient training and deployment of the proposed model. Recognizing the complexity of microbial interaction networks, this strategy employs an adaptive, energy-based exploration mechanism to identify equilibrium states in host&#x2013;pathogen systems. It integrates domain-specific priors, such as host susceptibility maps and known resistance loci, to guide search trajectories and optimize model fidelity. 
The strategy is designed to handle large-scale data inputs, including metagenomic sequencing, gene expression profiles, and spatial&#x2013;temporal disease incidence patterns. It further incorporates mechanisms to balance accuracy and generalizability, enabling robust performance across diverse biological systems and disease contexts.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Preliminaries</title>
<p>In this section, we formalize the problem of disease microbiology from a symbolic and mathematical perspective, focusing on the intrinsic dynamics between pathogens, host immune responses, and their surrounding environment. The goal is to construct an abstracted, mathematically tractable representation that captures the complexity of microbial infections and host&#x2013;pathogen interactions over time and space. This formulation provides the foundation upon which our modeling and inference methods are built.</p>
<p>Let <inline-formula>
<mml:math display="inline" id="im1"><mml:mi mathvariant="script">P</mml:mi></mml:math></inline-formula> denote the set of all microbial pathogens under consideration, indexed by <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">P</mml:mi></mml:mrow></mml:math></inline-formula>. Let <inline-formula>
<mml:math display="inline" id="im3"><mml:mi>&#x210b;</mml:mi></mml:math></inline-formula> denote the set of hosts, indexed by <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>&#x210b;</mml:mi></mml:mrow></mml:math></inline-formula>. Each pathogen <inline-formula>
<mml:math display="inline" id="im5"><mml:mi>p</mml:mi></mml:math></inline-formula> is associated with a genomic signature <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>g</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and a set of virulence factors <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>v</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>. Each host <inline-formula>
<mml:math display="inline" id="im8"><mml:mi>h</mml:mi></mml:math></inline-formula> has an immune profile <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>i</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and a susceptibility vector <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>.</p>
<p>We denote the infection state of a host <inline-formula>
<mml:math display="inline" id="im11"><mml:mi>h</mml:mi></mml:math></inline-formula> by pathogen <inline-formula>
<mml:math display="inline" id="im12"><mml:mi>p</mml:mi></mml:math></inline-formula> at time <inline-formula>
<mml:math display="inline" id="im13"><mml:mi>t</mml:mi></mml:math></inline-formula> as <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, where <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula> indicates active infection. The probability of infection is governed by a dynamic process (<xref ref-type="disp-formula" rid="eq1">Equation 1</xref>).</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mtext>Pr&#xa0;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>|</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>i</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>&#x3d5;</mml:mi><mml:mi>p</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msubsup><mml:msub><mml:mtext>&#x3a8;</mml:mtext><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is the sigmoid function, <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>&#x3d5;</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is a pathogen-specific coefficient vector, and <inline-formula>
<mml:math display="inline" id="im18"><mml:mrow><mml:msub><mml:mtext>&#x3a8;</mml:mtext><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>concat</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>i</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is the joint feature mapping.</p>
<p>The overall disease expression level <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mi>&#x211d;</mml:mi></mml:mrow></mml:math></inline-formula> can be modeled (<xref ref-type="disp-formula" rid="eq2">Equation 2</xref>).</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mi>&#x3b2;</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msup><mml:mi>&#x3b3;</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3f5;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im20"><mml:mrow><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>concat</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>i</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>e</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> denotes environmental covariates, and <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mi>&#x3f5;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x223c;</mml:mo><mml:mi mathvariant="script">N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msup><mml:mi>&#x3c3;</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> represents stochastic noise.</p>
<p>To account for spatial propagation, let <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:mi mathvariant="script">N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> be the set of neighboring hosts. The spatial infection influence <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is defined (<xref ref-type="disp-formula" rid="eq3">Equation 3</xref>).</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:munder><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>with weights <inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> encoding contact frequency or proximity between hosts. The weight <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represents the interaction strength or proximity between hosts <inline-formula>
<mml:math display="inline" id="im27"><mml:mi>h</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im28"><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:math></inline-formula> and modulates the degree of latent state influence across the spatial host graph. In datasets with geolocation metadata, <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is computed as a Gaussian kernel over the Euclidean distance between sampling coordinates. In clinical or ecological datasets, it can alternatively reflect contextual similarity such as co-hospitalization, shared community, or temporal overlap. This formulation allows the model to encode soft neighborhood-level infection influence even in the absence of explicit contact tracing data.</p>
<p>The growth potential of pathogen <italic>p</italic> at time <italic>t</italic> under nutrient condition <italic>&#x3bd;<sub>t</sub></italic> is captured (<xref ref-type="disp-formula" rid="eq4">Equation 4</xref>).</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>&#x3bc;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>exp&#xa0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msup><mml:mi>&#x3b1;</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>&#x3bd;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x3b4;</mml:mi><mml:mi>p</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3b1;</italic> is a nutrient responsiveness vector and <italic>&#x3b4;<sub>p</sub></italic> reflects the death rate or competition disadvantage of pathogen <italic>p</italic>. The competition disadvantage term reflects the ecological principle that not all pathogens can coexist with equal likelihood within a host or environment. In our model, it is implemented by adjusting the infection probability of pathogen <italic>p</italic> based on the predicted presence of competing pathogens <italic>p</italic>&#x2032;. A suppression coefficient <italic>&#x3b1;<sub>pp&#x2032;</sub></italic> encodes the strength of disadvantage imposed by <italic>p</italic>&#x2032; on <italic>p</italic>. This mechanism allows the model to favor dominant strains while suppressing less fit or competitively excluded ones, aligning with real-world infection dynamics observed in microbiome and virology studies.</p>
<p>We define the global epidemiological potential &#x39e;<italic><sub>t</sub></italic> over the host population &#x210b; (<xref ref-type="disp-formula" rid="eq5">Equation 5</xref>).</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:mtext>&#x39e;</mml:mtext><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>&#x210b;</mml:mi></mml:mrow></mml:munder><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">P</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>&#x3bc;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mn>1</mml:mn><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>which measures the system-wide infection propagation force at time <italic>t</italic>.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>PathoGenesisNet</title>
<p>To capture the multi-scale complexity of host&#x2013;pathogen interactions in disease microbiology, we propose a novel computational architecture termed PathoGenesisNet. This model integrates pathogen genetic determinants, host immune states, environmental mediators, and temporal&#x2013;spatial dynamics through a biologically informed latent state formulation. The architecture is designed to be modular, interpretable, and data-adaptive, and it facilitates analysis across heterogeneous biological systems (as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Schematic diagram of PathoGenesisNet. This figure illustrates the multi-stage architecture for latent trajectory inference under ecological, spatial, and resistance constraints. It features dilated convolution layers (with varying dilation rates), deformable transformer blocks, and latent temporal representations. The pipeline integrates environmental dynamics, immune adaptation modules, and attention-based refinement, culminating in resistance-aware regularization that aligns latent embeddings with antimicrobial resistance profiles. The spatial consistency and variational updates are visually encoded, reflecting fixed-point optimization for stable latent refinement and probabilistic infection prediction.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fcimb-15-1616189-g001.tif">
<alt-text content-type="machine-generated">Flowchart depicting a deraining process using a transformation network. It starts with a &#x201c;Rainy Input&#x201d; image and processes through multiple &#x201c;Diltformer Blocks&#x201d; with dimensions changing from H&#xd7;W&#xd7;C to H/8 &#xd7; W/8 &#xd7; 8C. It includes modules for &#x201c;Latent Temporal Representation&#x201d; and &#x201c;Immune Adaptation Modeling&#x201d; using convolution, concatenation, and element-wise addition/multiplication. The final output is the &#x201c;Derained Output&#x201d; image. There are visual cues for operations like dilated convolution, reshaping, and ReLU activation functions.</alt-text>
</graphic></fig>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Latent temporal representation</title>
<p>We model the progression of host&#x2013;pathogen interaction as a dynamic latent state process. At each time step <italic>t</italic>, the latent vector <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mi>d</mml:mi></mml:msup></mml:mrow></mml:math></inline-formula> represents the infection state of host <italic>h</italic> with respect to pathogen <italic>p</italic>. The state evolves under the influence of molecular, spatial, and environmental factors:</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3f5;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is a learnable fusion function, and <italic>&#x3f5;<sub>hpt</sub></italic> &#x223c; <inline-formula>
<mml:math display="inline" id="im32"><mml:mi mathvariant="script">N</mml:mi></mml:math></inline-formula> (0,&#x3a3;) introduces biological variability. The fusion function is instantiated as a multi-modal update layer:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mtext>tanh&#xa0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>z</mml:mi></mml:msub><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>g</mml:mi></mml:msub><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>e</mml:mi></mml:msub><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>&#x3b7;</mml:mi></mml:msub><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>W</italic><sub>&#x2217;</sub> are trainable projections. This allows the latent state to incorporate temporal continuity and cross-modal context. The latent state is decoded into an infection probability through a Bernoulli likelihood:</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mtext>Pr</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>|</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mi>u</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>providing interpretable probabilistic outputs for binary infection events. To support memory-dependent dynamics such as chronic conditions or latency, we integrate a gated update:</p>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x3b4;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3f5;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3b4;</italic> &#x2208; [0,1] is a learnable decay factor. This formulation blends short-term and long-term dynamics adaptively. To ensure diversity among pathogen trajectories, we apply a dissimilarity loss (<xref ref-type="disp-formula" rid="eq10">Equation 10</xref>):</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mtext>inter</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2260;</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:munder><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mo>&#x22a4;</mml:mo></mml:msubsup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi mathvariant="double-struck">I</mml:mi><mml:mrow><mml:mtext>dissimilar</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>which discourages convergence of embeddings across ecologically distinct pathogen classes.</p>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Immune adaptation modeling</title>
<p>We further introduce a biologically inspired gating mechanism to reflect host immune memory. For each host <italic>h</italic>, the immune gate <italic>m<sub>ht</sub></italic> modulates susceptibility at time <italic>t</italic> based on prior exposures (<xref ref-type="disp-formula" rid="eq11">Equation 11</xref>):</p>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:msub><mml:mi>m</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msubsup><mml:mi>V</mml:mi><mml:mi>i</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msubsup><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">P</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>V<sub>i</sub></italic> is a learnable vector and <italic>G<sub>p</sub></italic> is the genotype embedding of pathogen <italic>p</italic>. The output <italic>m<sub>ht</sub></italic> &#x2208; [0, 1] reflects the priming level of the host. The infection probability is then reweighted (<xref ref-type="disp-formula" rid="eq12">Equation 12</xref>):</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>m</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:mtext>Pr&#xa0;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>|</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>so that previously encountered pathogens induce a weaker infection response. Genomic embeddings are computed from pathogen sequences using a CNN encoder (<xref ref-type="disp-formula" rid="eq13">Equation 13</xref>):</p>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M13"><mml:mrow><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>CNN</mml:mtext></mml:mrow><mml:mi>&#x3d5;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>Seq</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>which captures local motifs and structure within the nucleotide sequences. We also define a memory cell <italic>M<sub>ht</sub></italic> to store long-term immune information (<xref ref-type="disp-formula" rid="eq14">Equation 14</xref>):</p>
<disp-formula id="eq14"><label>(14)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x3c1;</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3c1;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">P</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>with attention weights computed as follows (<xref ref-type="disp-formula" rid="eq15">Equation 15</xref>):</p>
<disp-formula id="eq15"><label>(15)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>exp</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>u</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mo>&#x2211;</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:msub><mml:mtext>exp</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>u</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>allowing the model to prioritize pathogen-specific signals during immune adaptation (<xref ref-type="disp-formula" rid="eq16">Equation 16</xref>).</p>
</sec>
<sec id="s3_3_3">
<label>3.3.3</label>
<title>Environmental dynamics integration</title>
<p>Environmental factors play a crucial role in shaping host&#x2013;pathogen dynamics, influencing both microbial activity and host immune responses. To capture these effects, we introduce a learned transformation over raw environmental covariates <inline-formula>
<mml:math display="inline" id="im33"><mml:mi>&#x2130;</mml:mi></mml:math></inline-formula><italic><sub>hpt</sub></italic>, which may include features such as ambient temperature, humidity, pH, local microbiota composition, or toxin presence. These variables are processed through a filter network <inline-formula>
<mml:math display="inline" id="im34"><mml:mi mathvariant="script">G</mml:mi></mml:math></inline-formula><italic><sub>&#x3c8;</sub></italic> that learns context-specific importance weights and non-linear relationships (as shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>).</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Schematic diagram of environmental dynamics integration. This figure presents a multi-modal framework for ecological modeling that leverages frozen language and vision transformer layers. Textual and visual embeddings from environmental and clinical sources are processed through modality-specific frozen transformers and aligned in a shared latent space. The outputs are fused via a multi-modal integration stage, enabling joint reasoning over heterogeneous inputs. Environmental dynamics are incorporated at the fusion level, facilitating informed infection trajectory estimation under data-scarce conditions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fcimb-15-1616189-g002.tif">
<alt-text content-type="machine-generated">Diagram of a fusion stage in a multimodal model. It includes a Frozen Language Transformer Layer and a Frozen Vision Transformer Layer. Both layers process inputs labeled cls and numbered sequentially, integrating components z_cp and f_cp with environmental dynamics. Arrows indicate data flow within and between layers.</alt-text>
</graphic></fig>
<disp-formula id="eq16"><label>(16)</label>
<mml:math display="block" id="M16"><mml:mrow><mml:msubsup><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">G</mml:mi><mml:mi>&#x3c8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>ReLU</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>&#x2130;</mml:mi></mml:msub><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im35"><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>&#x2130;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is a learnable matrix and <inline-formula>
<mml:math display="inline" id="im36"><mml:mi>d</mml:mi></mml:math></inline-formula> is a bias term. The Rectified Linear Unit (ReLU) activation introduces non-linearity while preserving sparsity and directional influence. The transformed environmental signal <inline-formula>
<mml:math display="inline" id="im37"><mml:mrow><mml:msubsup><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula> is then concatenated with latent host&#x2013;pathogen representations and integrated into the latent state evolution module, enabling dynamic environmental modulation of infection progression.</p>
<p>To preserve temporal coherence and ensure biologically plausible trajectories, we impose a smoothness constraint on the latent dynamics. This regularization penalizes abrupt changes in the latent infection state unless justified by the model inputs (<xref ref-type="disp-formula" rid="eq17">Equation 17</xref>).</p>
<disp-formula id="eq17"><label>(17)</label>
<mml:math display="block" id="M17"><mml:mrow><mml:msub><mml:mi>&#x211b;</mml:mi><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:msup><mml:mrow><mml:mo>&#x2016;</mml:mo><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x2016;</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>which encourages gradual transitions in the health-infection embedding over time, consistent with biological progression and immune regulation processes.</p>
<p>In addition, we incorporate a diversity regularization to prevent latent space collapse and enforce distinctiveness among pathogen embeddings. This constraint is designed to ensure that pathogen representations remain sufficiently dissimilar, especially across different taxonomic or functional groups (<xref ref-type="disp-formula" rid="eq18">Equation 18</xref>).</p>
<disp-formula id="eq18"><label>(18)</label>
<mml:math display="block" id="M18"><mml:mrow><mml:msub><mml:mi>&#x211b;</mml:mi><mml:mrow><mml:mtext>div</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2260;</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:munder><mml:mi>exp</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mrow><mml:mo>&#x2016;</mml:mo><mml:mrow><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:msub></mml:mrow><mml:mo>&#x2016;</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where high similarity between unrelated pathogens is penalized exponentially, maintaining robustness and expressiveness in the genotype embedding space.</p>
<p>The likelihood of observing the infection data across the entire host&#x2013;pathogen population is aggregated as the total evidence term (<xref ref-type="disp-formula" rid="eq19">Equation 19</xref>).</p>
<disp-formula id="eq19"><label>(19)</label>
<mml:math display="block" id="M19"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>&#x210b;</mml:mi></mml:mrow></mml:munder><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">P</mml:mi></mml:mrow></mml:munder><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:mtext>log&#xa0;Pr</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>which measures how well the model&#x2019;s latent representations account for observed infection patterns. To train the model, we define a joint objective that balances evidence maximization with regularization penalties (<xref ref-type="disp-formula" rid="eq20">Equation 20</xref>).</p>
<disp-formula id="eq20"><label>(20)</label>
<mml:math display="block" id="M20"><mml:mrow><mml:mi mathvariant="script">J</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:msub><mml:mi>&#x211b;</mml:mi><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:msub><mml:mi>&#x211b;</mml:mi><mml:mrow><mml:mtext>div</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3bb;</italic><sub>1</sub> and <italic>&#x3bb;</italic><sub>2</sub> are hyperparameters that modulate the trade-off between predictive performance, temporal continuity, and latent diversity.</p>
</sec>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Microbial equilibrium search strategy</title>
<p>To effectively deploy the proposed PathoGenesisNet framework in complex biological environments, we introduce a novel inference strategy termed microbial equilibrium search strategy (MESS). This strategy is designed to iteratively approximate the latent microbe&#x2013;host system&#x2019;s equilibrium states by leveraging domain-informed priors, biological constraints, and spatiotemporal observations. MESS integrates ideas from variational inference, energy-based optimization, and population ecology to form a biologically grounded inference mechanism (as shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>).</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Schematic diagram of microbial equilibrium search strategy (MESS). This figure depicts an integrated architecture combining image and text encoders with multi-stage attention mechanisms for infection trajectory modeling. The system extracts visual and textual features through parallel encoders, applies self-attention and cross-modal attention (VFM Attention), and incorporates early fusion strategies. Downstream modules implement fine-grained compensation and domain-informed regularizations, including trajectory-based posterior inference, spatial consistency enforcement, and resistance-aware adjustments, yielding biologically coherent segmentation and prediction outputs.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fcimb-15-1616189-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating three processes: The top process shows an image encoder leading to self-self attention, resulting in a segmentation map connecting to a text encoder. The middle process involves an image encoder linked to VFM attention, also producing a segmentation map for a text encoder. The bottom process represents ecological and spatial consistency, with an image encoder and text encoder interacting through early-fused attention, creating a coarse map leading to fine-grained compensation, annotated with resistance-aware regularization.</alt-text>
</graphic></fig>
<sec id="s3_4_1">
<label>3.4.1</label>
<title>Trajectory-based posterior formulation</title>
<p>To perform inference over latent host&#x2013;pathogen dynamics, we model the progression of infection states and underlying latent variables across time for each host&#x2013;pathogen pair. Let the latent trajectory and the corresponding sequence of infection states be defined as follows (<xref ref-type="disp-formula" rid="eq21">Equation 21</xref>):</p>
<disp-formula id="eq21"><label>(21)</label>
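<mml:math display="block" id="M21"><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:msub><mml:mi mathvariant="script">X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:msubsup><mml:mo>,</mml:mo></mml:mrow></mml:math>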
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im38"><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mi>d</mml:mi></mml:msup></mml:mrow></mml:math></inline-formula> encodes the latent physiological state and <inline-formula>
<mml:math display="inline" id="im39"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mn>1</mml:mn><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> indicates binary infection status. We observe only a subset of these infection states, <inline-formula>
<mml:math display="inline" id="im40"><mml:mrow><mml:msubsup><mml:mi mathvariant="script">X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x2282;</mml:mo><mml:msub><mml:mi mathvariant="script">X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, and aim to infer the full latent trajectory <inline-formula>
<mml:math display="inline" id="im41"><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> that aligns with biological dynamics and observed data. We define an energy-based probabilistic model over trajectories:</p>
<disp-formula id="eq22"><label>(22)</label>
<mml:math display="block" id="M22"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msubsup><mml:mi mathvariant="script">X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mtext>&#x398;</mml:mtext><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x221d;</mml:mo><mml:mtext>exp&#xa0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>dyn</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>cons</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mi mathvariant="script">X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>host</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where &#x398; denotes model parameters. The energy <inline-formula>
<mml:math display="inline" id="im42"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>dyn</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> enforces temporal consistency, <inline-formula>
<mml:math display="inline" id="im43"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>cons</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> ensures agreement with observed infection labels, and <inline-formula>
<mml:math display="inline" id="im44"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>host</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> incorporates host-specific priors. The dynamic energy is defined as follows:</p>
<disp-formula id="eq23"><label>(23)</label>
<mml:math display="block" id="M23"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>dyn</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:msup><mml:mrow><mml:mo>&#x2016;</mml:mo><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x2016;</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im45"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is a non-linear transition function, <inline-formula>
<mml:math display="inline" id="im46"><mml:mrow><mml:msub><mml:mi>G</mml:mi><mml:mi>p</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the pathogen embedding, <inline-formula>
<mml:math display="inline" id="im47"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the host information, <inline-formula>
<mml:math display="inline" id="im48"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the environmental input, and <inline-formula>
<mml:math display="inline" id="im49"><mml:mrow><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> denotes spatial influence. Observed infection status is incorporated via</p>
<disp-formula id="eq24"><label>(24)</label>
<mml:math display="block" id="M24"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>cons</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:mn mathvariant="double-struck">1</mml:mn><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup><mml:mtext>&#xa0;exists</mml:mtext></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mtext>log&#xa0;</mml:mtext><mml:mi>&#x3c3;</mml:mi><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3c3;</italic>(&#xb7;) is the sigmoid function, and <italic>w</italic> and <italic>b</italic> are emission parameters mapping latent states to infection likelihood. Host-specific regularization is defined as follows:</p>
<disp-formula id="eq25"><label>(25)</label>
<mml:math display="block" id="M25"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>host</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x3bb;</mml:mi><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:msup><mml:mrow><mml:mo>&#x2016;</mml:mo><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mover accent="true"><mml:mi>Z</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>baseline</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo>&#x2016;</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im50"><mml:mrow><mml:msubsup><mml:mover accent="true"><mml:mi>Z</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>baseline</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> is the baseline latent state estimated from population-level profiles, and <inline-formula>
<mml:math display="inline" id="im51"><mml:mi>&#x3bb;</mml:mi></mml:math></inline-formula> controls the regularization strength. The final inference objective is framed as an energy minimization problem:</p>
<disp-formula id="eq26"><label>(26)</label>
<mml:math display="block" id="M26"><mml:mrow><mml:msubsup><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#xa0;min</mml:mtext></mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:munder><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>dyn</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>cons</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>host</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>This approach yields biologically consistent latent trajectories that integrate temporal dynamics, partial observations, and prior knowledge.</p>
</sec>
<sec id="s3_4_2">
<label>3.4.2</label>
<title>Ecological and spatial consistency</title>
<p>To incorporate ecological dependencies and spatial interactions, we introduce a spatial diffusion regularizer that enforces local consistency among neighboring hosts. Let <inline-formula>
<mml:math display="inline" id="im52"><mml:mrow><mml:mi mathvariant="script">N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> denote the spatial neighborhood of host <inline-formula>
<mml:math display="inline" id="im53"><mml:mi>h</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im54"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represent an affinity weight based on spatial proximity or ecological similarity (as shown in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>).</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Schematic diagram of ecological and spatial consistency. This figure illustrates a hierarchical encoder&#x2013;decoder network designed for ecological modeling and spatial consistency enforcement. The architecture includes multiple Specialized Convolutional Neural Network (SCNN) blocks and general convolution blocks, interleaved with multi-scale attention (MSA) gates, enabling rich feature extraction across resolutions. Skip connections and upsampling operations facilitate fine-grained reconstruction, while spatial priors are integrated to guide segmentation outputs. The model emphasizes ecological coherence and latent alignment across spatial hierarchies, aligning with the broader resistance-aware, trajectory-based formulation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fcimb-15-1616189-g004.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a neural network model for ecological and spatial consistency. It begins with an SCNN Block, followed by several General Conv Blocks and SCNN Blocks, interconnected by skip connections. MSA Gates are used to direct the flow, with details about dimensions and operations specified. A small inset image shows a person holding a board labeled &#x201c;Diagnosis&#x201d;.</alt-text>
</graphic></fig>
<disp-formula id="eq27"><label>(27)</label>
<mml:math display="block" id="M27"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>spatial</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:munder><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msup><mml:mrow><mml:mo>&#x2016;</mml:mo><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x2016;</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>This term encourages latent states of spatially adjacent hosts to remain similar at each time point, reflecting potential transmission or shared environmental influence.</p>
<p>We adopt a variational inference framework to approximate the posterior over latent trajectories. Let <italic>Q<sub>&#x3d5;</sub></italic>(&#x1d4b5;<italic><sub>hp</sub></italic>) be a variational distribution, with parameters <italic>&#x3d5;</italic> learned via a recognition model such as a recurrent neural network or temporal graph encoder. The training objective is to maximize the evidence lower bound (ELBO), which balances data fidelity and prior consistency (<xref ref-type="disp-formula" rid="eq28">Equation 28</xref>).</p>
<disp-formula id="eq28"><label>(28)</label>
<mml:math display="block" id="M28"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>ELBO</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="double-struck">E</mml:mi><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>&#x3d5;</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mtext>log&#xa0;</mml:mtext><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi mathvariant="script">X</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msubsup><mml:mo>|</mml:mo><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mtext>KL</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>&#x3d5;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2016;</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>prior</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im55"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>prior</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> encodes dynamic biological priors, ecological structure, and spatial constraints. The first term promotes the accurate reconstruction of observed infection states, while the second penalizes divergence from biologically plausible dynamics.</p>
<p>To incorporate spatial and ecological constraints directly into the prior, we redefine it as a Gibbs distribution (<xref ref-type="disp-formula" rid="eq29">Equation 29</xref>).</p>
<disp-formula id="eq29"><label>(29)</label>
<mml:math display="block" id="M29"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>prior</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x221d;</mml:mo><mml:mtext>exp&#xa0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>dyn</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>host</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>spatial</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>This prior formulation integrates multiple sources of structural knowledge into the learning process, encouraging the model to discover trajectories that align with host dynamics, population norms, and spatial topology.</p>
<p>The optimization is performed via stochastic gradient descent, using samples from <italic>Q<sub>&#x3d5;</sub></italic> to estimate the ELBO and its gradients. The variational refinement step updates <italic>Q<sub>&#x3d5;</sub></italic> iteratively to better approximate the true posterior.</p>
<p>The final loss combines the ELBO with optional regularization terms for robustness (<xref ref-type="disp-formula" rid="eq30">Equation 30</xref>).</p>
<disp-formula id="eq30"><label>(30)</label>
<mml:math display="block" id="M30"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>ELBO</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>spatial</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im56"><mml:mrow><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> tunes the influence of spatial consistency in the variational objective.</p>
</sec>
<sec id="s3_4_3">
<label>3.4.3</label>
<title>Resistance-aware regularization</title>
<p>To integrate prior biological knowledge regarding antimicrobial resistance, we introduce a resistance-aware regularization term that biases latent dynamics toward compatibility with known resistance mechanisms. Let <inline-formula>
<mml:math display="inline" id="im57"><mml:mrow><mml:msub><mml:mi>&#x211b;</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mi>d</mml:mi></mml:msup></mml:mrow></mml:math></inline-formula> denote a domain-specific resistance embedding associated with pathogen <italic>p</italic>. We define the resistance energy as follows (<xref ref-type="disp-formula" rid="eq31">Equation 31</xref>).</p>
<disp-formula id="eq31"><label>(31)</label>
<mml:math display="block" id="M31"><mml:mrow><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>resist</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:msup><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>&#x211b;</mml:mi><mml:mi>p</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msubsup><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3c3;</italic>(&#xb7;) is the sigmoid function, and the term penalizes latent states that deviate from expected resistance-aligned activations. This encourages the latent space to reflect meaningful molecular and phenotypic resistance patterns.</p>
<p>To jointly optimize all components, we define a composite equilibrium objective that integrates biological, observational, spatial, and resistance regularizations with variational inference (<xref ref-type="disp-formula" rid="eq32">Equation 32</xref>).</p>
<disp-formula id="eq32"><label>(32)</label>
<mml:math display="block" id="M32"><mml:mrow><mml:msub><mml:mi mathvariant="script">J</mml:mi><mml:mrow><mml:mtext>MESS</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>bio</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>obs</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>spatial</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:msub><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mtext>resist</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mtext>KL</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>&#x3d5;</mml:mi></mml:msub><mml:mo>&#x2016;</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>prior</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3bb;<sub>s</sub></italic>, <italic>&#x3bb;<sub>r</sub></italic>, and <italic>&#x3bb;<sub>kl</sub></italic> are hyperparameters balancing spatial structure, resistance alignment, and posterior regularity, respectively. This formulation allows the model to harmonize mechanistic priors with empirical data.</p>
<p>To solve the objective, a fixed-point iterative update is applied to the latent variables, stabilizing convergence under energy-based gradients. Let <inline-formula>
<mml:math display="inline" id="im58"><mml:mrow><mml:msubsup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> be the latent state at iteration <inline-formula>
<mml:math display="inline" id="im59"><mml:mi>k</mml:mi></mml:math></inline-formula> (<xref ref-type="disp-formula" rid="eq33">Equation 33</xref>).</p>
<disp-formula id="eq33"><label>(33)</label>
<mml:math display="block" id="M33"><mml:mrow><mml:msubsup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b1;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:msubsup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x3b1;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mo mathvariant="normal">&#x2207;</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi mathvariant="script">J</mml:mi><mml:mrow><mml:mtext>MESS</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im60"><mml:mrow><mml:mi>&#x3b1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is a learning rate controlling update inertia. This scheme ensures smooth convergence toward energy minima while preserving temporal and biological coherence.</p>
<p>For practical application, the final latent state <inline-formula>
<mml:math display="inline" id="im61"><mml:mrow><mml:msubsup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>final</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> is used to predict infection intensity or risk at each time point, yielding a probabilistic score (<xref ref-type="disp-formula" rid="eq34">Equation 34</xref>).</p>
<disp-formula id="eq34"><label>(34)</label>
<mml:math display="block" id="M34"><mml:mrow><mml:msubsup><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>inf</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mi>u</mml:mi><mml:mo>&#x22a4;</mml:mo></mml:msup><mml:msubsup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>final</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im62"><mml:mrow><mml:mi>u</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mi>d</mml:mi></mml:msup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im63"><mml:mrow><mml:mi>c</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>&#x211d;</mml:mi></mml:mrow></mml:math></inline-formula> are learned parameters mapping latent embeddings to infection likelihoods. This scalar <inline-formula>
<mml:math display="inline" id="im64"><mml:mrow><mml:msubsup><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext>inf</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> can be interpreted as a continuous infection probability or severity index.</p>
<p>While the symbolic formulations are abstract in nature, they play a critical role in guiding the structure and behavior of the implemented system. The latent state variable <inline-formula>
<mml:math display="inline" id="im65"><mml:mrow><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, spatial infection influence <inline-formula>
<mml:math display="inline" id="im66"><mml:mrow><mml:msub><mml:mi>&#x3b7;</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, and infection probability <inline-formula>
<mml:math display="inline" id="im67"><mml:mrow><mml:mtext>Pr</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>|</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> are directly instantiated through recurrent updates, graph-based smoothing, and decoder layers. The mathematical definition of equilibrium dynamics (<xref ref-type="disp-formula" rid="eq22">Equations 22</xref>&#x2013;<xref ref-type="disp-formula" rid="eq27">27</xref>) forms the theoretical foundation of our energy-based variational inference strategy, MESS. In particular, the latent trajectory evolution (<xref ref-type="disp-formula" rid="eq6">Equations 6</xref>&#x2013;<xref ref-type="disp-formula" rid="eq9">9</xref>) is operationalized via multi-modal fusion over pathogen embeddings, host immune features, and environmental covariates within a temporal graph neural network. Similarly, the resistance-aware energy term (<xref ref-type="disp-formula" rid="eq31">Equation 31</xref>) is realized through a regularization function over the latent state alignment with known resistance vectors. These mappings ensure that the learned representations not only optimize classification performance but also reflect meaningful biological constraints. By embedding domain-specific biological relationships into our learning process through principled equations, we move beyond heuristic fusion and enable interpretable, extensible, and biologically plausible modeling. We highlight that each key equation in the theoretical model corresponds to a concrete module in the implemented architecture, as summarized in our modular design flow.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental setup</title>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset</title>
<p>DIBaS dataset (<xref ref-type="bibr" rid="B14">GP et&#xa0;al., 2023</xref>) is a high-resolution biomedical image dataset designed for the classification of bacterial species. It comprises microscopic images of various bacterial genera obtained using differential interference contrast (DIC) microscopy. The dataset includes 660 images grouped into 33 bacterial classes, each containing 20 images. DIBaS is particularly useful for developing deep learning models focused on medical image classification and microbial phenotyping. The standardized acquisition and diverse visual textures across classes make it suitable for tasks such as fine-grained classification, feature extraction, and transfer learning in microbiological domains. Its clear morphological distinctions support experiments in interpretable vision models and robustness evaluation in clinical diagnostics. KAU-BCMD Dataset (<xref ref-type="bibr" rid="B32">Nadkarni and Noronha, 2023</xref>) is a curated dataset for bacterial colony morphology detection and classification. It includes over 12,000 high-quality images covering different types of bacterial colonies grown on nutrient agar plates. The dataset captures variations in colony shape, size, margin, elevation, and color, annotated by microbiologists for ground truth verification. Each sample is labeled with colony type, and metadata includes cultivation time and conditions. KAU-BCMD is well-suited for developing models aimed at automated colony recognition, phenotype clustering, and biological trait prediction. It has been used in research on computer-aided diagnosis, microbial ecology, and pathogen detection using vision-based systems. TBX11K dataset (<xref ref-type="bibr" rid="B25">Liu et&#xa0;al., 2020</xref>) is a floral taxonomy dataset known as the Oxford 102 Flower Dataset. It contains 8,189 images categorized into 102 flower species commonly observed in the United Kingdom. 
The dataset features extensive intra-class variation due to differing camera angles, lighting, and environmental backgrounds. Each species has 40&#x2013;258 images, and class labels are derived from expert annotations. TBX11K is ideal for fine-grained classification tasks, where visual cues such as petal texture, color, and structure are critical. The dataset is frequently used in transfer learning benchmarks, representation learning studies, and zero-shot recognition of subtle semantic attributes in natural imagery. Malaria dataset (<xref ref-type="bibr" rid="B1">Arshad et&#xa0;al., 2022</xref>) comprises over 27,000 cell images labeled as either parasitized or uninfected, derived from thin blood smear slides. The images are collected under consistent microscopy settings and manually annotated by experts. Each image captures a single red blood cell and is intended for the binary classification of malaria presence. The dataset enables the training and validation of automated diagnostic models using CNNs and other deep learning techniques. It plays a vital role in real-world healthcare applications, especially for low-resource settings, supporting tasks such as infection detection, model generalization across staining styles, and mobile diagnostic integration.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental details</title>
<p>We conduct all experiments using PyTorch on NVIDIA V100 GPUs with 32GB memory. We adopt ResNet50 and ViT-B/16 as our backbone architectures for all baseline comparisons and ablation studies. We train each model using stochastic gradient descent (SGD) with momentum or AdamW, depending on the backbone. For ResNet50, we use SGD with a momentum of 0.9 and a weight decay of 1e&#x2212;4. For ViT-B/16, we use AdamW with a weight decay of 0.05 and <italic>&#x3b2;</italic><sub>1</sub>&#xa0;=&#xa0;0.9, <italic>&#x3b2;</italic><sub>2</sub>&#xa0;=&#xa0;0.999. We initialize the learning rate to 0.01 for ResNet and 3e&#x2212;4 for ViT models. We use a cosine annealing schedule to decay the learning rate across training epochs. We perform training for 100 epochs for ResNet-based models and 300 epochs for ViT-based models. We set the batch size to 256 unless memory constraints require adjustment. We resize all input images to 224 &#xd7; 224. We use standard data augmentation, including random horizontal flipping, random cropping, and color jittering. For ViT training, we adopt RandAugment and Mixup strategies with <italic>&#x3b1;</italic>&#xa0;=&#xa0;0.2 for Mixup and a cutmix probability of 0.5. We apply label smoothing with <italic>&#x3f5;</italic>&#xa0;=&#xa0;0.1 to improve generalization. For evaluation, we report the top 1 and top 5 accuracy metrics. For multi-label datasets such as DTD, we use mean average precision (mAP). We use early stopping based on validation accuracy and save the model checkpoint with the best performance. We repeat all experiments with three random seeds, and we report the average results with standard deviation. We use mixed-precision training via NVIDIA Apex to accelerate training and reduce GPU memory usage. We apply gradient clipping with a max norm of 5.0 to stabilize training in large-scale scenarios. For datasets with imbalanced classes, such as TBX11K, we apply class-balanced sampling and focal loss to address the skewed distribution. 
We use dropout and layer normalization in transformer-based models to prevent overfitting. For fine-tuning pretrained models, we freeze the first few layers during the initial 10 epochs and gradually unfreeze all layers. We apply a learning rate warm-up for the first 5 epochs using a linear schedule. For DTD, since the dataset is small and visually diverse, we use strong regularization and a higher dropout rate of 0.5. For KAU-BCMD and DIBaS, we follow the standard training&#x2013;validation&#x2013;test split without modification to maintain benchmark consistency. We select all hyperparameters based on cross-validation and standard configurations reported in prior top-tier conference papers, including CVPR and NeurIPS. Code implementation ensures reproducibility by fixing random seeds and using deterministic operations where applicable. During testing, we evaluate models using a center crop of the input images. We use test-time augmentation (TTA) only for final state-of-the-art (SOTA) comparison and not for ablation studies. Our experimental setup is designed to ensure fair and robust comparison across datasets and architectures and to validate the effectiveness of each proposed component under consistent training pipelines.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Comparison with SOTA methods</title>
<p><xref ref-type="table" rid="T1"><bold>Tables&#xa0;1</bold></xref>, <xref ref-type="table" rid="T2"><bold>2</bold></xref> provide a comprehensive comparison between our proposed method and several SOTA models across four widely used benchmarks, including DIBaS, KAU-BCMD, TBX11K, and MalariaDataset (DTD). As shown in the tables, our approach outperforms all baselines in every metric, consistently achieving the highest accuracy, precision, recall, and F1 score. On the DIBaS dataset, our method achieves a top accuracy of 84.61%, outperforming the closest competitor, Swin-T, by 2.38%, and showing substantial improvements over widely used models such as ResNet50 (78.12%) and ViT (81.47%). These results confirm the advantage of our approach in modeling cross-domain microbial image features under high intra-class variance. These improvements are reflected across all metrics&#x2014;our method leads by more than 1.5% on both precision and recall, suggesting a better balance between false positives and false negatives. Similarly, on the KAU-BCMD dataset, our model again surpasses other architectures, with a notable 89.42% accuracy and 89.03% F1 score. It performs better than ViT and ConvNeXt, showing not only strong generalization but also high robustness to object variability and intra-class diversity. The consistent gains in recall (89.75%) and precision (88.33%) indicate that our method effectively captures both coarse and fine object characteristics. The superiority over convolutional baselines and even advanced transformers highlights our architecture&#x2019;s unique balance between semantic abstraction and spatial preservation, key to handling complex real-world images. Particularly for KAU-BCMD, which involves high intra-class variation and low inter-class similarity, our model&#x2019;s context-aware learning and hierarchical representation prove especially advantageous.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comparison of ours with SOTA methods on DIBaS and KAU-BCMD datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Model</th>
<th valign="middle" colspan="4" align="center">DIBaS dataset</th>
<th valign="middle" colspan="4" align="center">KAU-BCMD dataset</th>
</tr>
<tr>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">ResNet50<break/><xref ref-type="bibr" rid="B13">Elpeltagy and Sallam (2021)</xref></td>
<td valign="middle" align="center">78.12 &#xb1; 0.03</td>
<td valign="middle" align="center">76.40 &#xb1; 0.02</td>
<td valign="middle" align="center">79.03 &#xb1; 0.02</td>
<td valign="middle" align="center">77.69 &#xb1; 0.02</td>
<td valign="middle" align="center">83.55 &#xb1; 0.02</td>
<td valign="middle" align="center">81.70 &#xb1; 0.02</td>
<td valign="middle" align="center">82.92 &#xb1; 0.03</td>
<td valign="middle" align="center">82.30 &#xb1; 0.03</td>
</tr>
<tr>
<td valign="middle" align="center">ViT<break/><xref ref-type="bibr" rid="B49">Yuan et&#xa0;al. (2021)</xref></td>
<td valign="middle" align="center">81.47 &#xb1; 0.02</td>
<td valign="middle" align="center">79.89 &#xb1; 0.03</td>
<td valign="middle" align="center">80.76 &#xb1; 0.02</td>
<td valign="middle" align="center">80.32 &#xb1; 0.02</td>
<td valign="middle" align="center">85.61 &#xb1; 0.03</td>
<td valign="middle" align="center">83.11 &#xb1; 0.02</td>
<td valign="middle" align="center">86.45 &#xb1; 0.02</td>
<td valign="middle" align="center">84.74 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center">ConvNeXt<break/><xref ref-type="bibr" rid="B46">Woo et&#xa0;al. (2023a)</xref></td>
<td valign="middle" align="center">80.03 &#xb1; 0.03</td>
<td valign="middle" align="center">80.15 &#xb1; 0.02</td>
<td valign="middle" align="center">77.98 &#xb1; 0.02</td>
<td valign="middle" align="center">79.05 &#xb1; 0.03</td>
<td valign="middle" align="center">84.97 &#xb1; 0.02</td>
<td valign="middle" align="center">85.20 &#xb1; 0.02</td>
<td valign="middle" align="center">82.78 &#xb1; 0.03</td>
<td valign="middle" align="center">83.97 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center">DenseNet121<break/><xref ref-type="bibr" rid="B9">Chhabra and Kumar (2022)</xref></td>
<td valign="middle" align="center">76.85 &#xb1; 0.02</td>
<td valign="middle" align="center">77.09 &#xb1; 0.03</td>
<td valign="middle" align="center">75.21 &#xb1; 0.02</td>
<td valign="middle" align="center">76.14 &#xb1; 0.02</td>
<td valign="middle" align="center">81.30 &#xb1; 0.03</td>
<td valign="middle" align="center">79.88 &#xb1; 0.02</td>
<td valign="middle" align="center">80.66 &#xb1; 0.02</td>
<td valign="middle" align="center">80.26 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center">MobileNetV3<break/><xref ref-type="bibr" rid="B20">Koonce (2021)</xref></td>
<td valign="middle" align="center">74.92 &#xb1; 0.03</td>
<td valign="middle" align="center">73.50 &#xb1; 0.02</td>
<td valign="middle" align="center">76.33 &#xb1; 0.02</td>
<td valign="middle" align="center">74.89 &#xb1; 0.02</td>
<td valign="middle" align="center">79.87 &#xb1; 0.03</td>
<td valign="middle" align="center">77.95 &#xb1; 0.02</td>
<td valign="middle" align="center">80.74 &#xb1; 0.02</td>
<td valign="middle" align="center">79.32 &#xb1; 0.03</td>
</tr>
<tr>
<td valign="middle" align="center">Swin-T<break/><xref ref-type="bibr" rid="B26">Liu et&#xa0;al. (2021)</xref></td>
<td valign="middle" align="center">82.23 &#xb1; 0.02</td>
<td valign="middle" align="center">81.90 &#xb1; 0.03</td>
<td valign="middle" align="center">80.45 &#xb1; 0.02</td>
<td valign="middle" align="center">81.17 &#xb1; 0.02</td>
<td valign="middle" align="center">86.75 &#xb1; 0.02</td>
<td valign="middle" align="center">85.94 &#xb1; 0.02</td>
<td valign="middle" align="center">85.60 &#xb1; 0.02</td>
<td valign="middle" align="center">85.77 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>84.61 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>83.72 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>84.93 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>84.32 &#xb1; 0.03</bold></td>
<td valign="middle" align="center"><bold>89.42 &#xb1; 0.03</bold></td>
<td valign="middle" align="center"><bold>88.33 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>89.75 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>89.03 &#xb1; 0.02</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>SOTA, state of the art.</p></fn>
<fn>
<p>Bold values indicate the numerical results of experimental indicators obtained by our method.</p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison of ours with SOTA methods on TBX11K and DTD datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Model</th>
<th valign="middle" colspan="4" align="center">TBX11K dataset</th>
<th valign="middle" colspan="4" align="center">DTD dataset</th>
</tr>
<tr>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">ResNet50<break/><xref ref-type="bibr" rid="B13">Elpeltagy and Sallam (2021)</xref></td>
<td valign="middle" align="center">89.32 &#xb1; 0.03</td>
<td valign="middle" align="center">87.91 &#xb1; 0.02</td>
<td valign="middle" align="center">88.43 &#xb1; 0.03</td>
<td valign="middle" align="center">88.17 &#xb1; 0.02</td>
<td valign="middle" align="center">74.28 &#xb1; 0.03</td>
<td valign="middle" align="center">75.02 &#xb1; 0.02</td>
<td valign="middle" align="center">72.95 &#xb1; 0.03</td>
<td valign="middle" align="center">73.97 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center">ViT<break/><xref ref-type="bibr" rid="B49">Yuan et&#xa0;al. (2021)</xref></td>
<td valign="middle" align="center">91.76 &#xb1; 0.02</td>
<td valign="middle" align="center">90.12 &#xb1; 0.03</td>
<td valign="middle" align="center">89.37 &#xb1; 0.02</td>
<td valign="middle" align="center">89.74 &#xb1; 0.02</td>
<td valign="middle" align="center">76.54 &#xb1; 0.03</td>
<td valign="middle" align="center">74.61 &#xb1; 0.02</td>
<td valign="middle" align="center">77.90 &#xb1; 0.02</td>
<td valign="middle" align="center">76.23 &#xb1; 0.03</td>
</tr>
<tr>
<td valign="middle" align="center">EfficientNet-B0<break/><xref ref-type="bibr" rid="B46">Woo et&#xa0;al. (2023a)</xref></td>
<td valign="middle" align="center">88.47 &#xb1; 0.03</td>
<td valign="middle" align="center">89.03 &#xb1; 0.02</td>
<td valign="middle" align="center">85.67 &#xb1; 0.02</td>
<td valign="middle" align="center">87.31 &#xb1; 0.03</td>
<td valign="middle" align="center">75.33 &#xb1; 0.02</td>
<td valign="middle" align="center">73.49 &#xb1; 0.03</td>
<td valign="middle" align="center">76.84 &#xb1; 0.02</td>
<td valign="middle" align="center">75.14 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center">DenseNet201<break/><xref ref-type="bibr" rid="B9">Chhabra and Kumar (2022)</xref></td>
<td valign="middle" align="center">87.95 &#xb1; 0.02</td>
<td valign="middle" align="center">88.60 &#xb1; 0.03</td>
<td valign="middle" align="center">86.28 &#xb1; 0.02</td>
<td valign="middle" align="center">87.43 &#xb1; 0.02</td>
<td valign="middle" align="center">72.60 &#xb1; 0.03</td>
<td valign="middle" align="center">71.28 &#xb1; 0.02</td>
<td valign="middle" align="center">74.41 &#xb1; 0.02</td>
<td valign="middle" align="center">72.81 &#xb1; 0.03</td>
</tr>
<tr>
<td valign="middle" align="center">InceptionV3<break/><xref ref-type="bibr" rid="B20">Koonce (2021)</xref></td>
<td valign="middle" align="center">90.61 &#xb1; 0.03</td>
<td valign="middle" align="center">91.15 &#xb1; 0.02</td>
<td valign="middle" align="center">88.91 &#xb1; 0.02</td>
<td valign="middle" align="center">90.01 &#xb1; 0.02</td>
<td valign="middle" align="center">77.12 &#xb1; 0.02</td>
<td valign="middle" align="center">76.32 &#xb1; 0.02</td>
<td valign="middle" align="center">74.80 &#xb1; 0.03</td>
<td valign="middle" align="center">75.55 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center">MobileNetV2<break/><xref ref-type="bibr" rid="B26">Liu et&#xa0;al. (2021)</xref></td>
<td valign="middle" align="center">85.33 &#xb1; 0.02</td>
<td valign="middle" align="center">84.70 &#xb1; 0.03</td>
<td valign="middle" align="center">83.91 &#xb1; 0.02</td>
<td valign="middle" align="center">84.30 &#xb1; 0.02</td>
<td valign="middle" align="center">73.80 &#xb1; 0.02</td>
<td valign="middle" align="center">72.49 &#xb1; 0.02</td>
<td valign="middle" align="center">73.15 &#xb1; 0.02</td>
<td valign="middle" align="center">72.82 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>93.85 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>92.47 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>93.01 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>92.74 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>79.63 &#xb1; 0.03</bold></td>
<td valign="middle" align="center"><bold>78.42 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>80.15 &#xb1; 0.02</bold></td>
<td valign="middle" align="center"><bold>79.28 &#xb1; 0.02</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>SOTA, state of the art.</p></fn>
<fn>
<p>Bold values indicate the numerical results of experimental indicators obtained by our method.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Further results on TBX11K and DTD datasets reinforce the advantages of our approach in fine-grained classification and attribute-based tasks. TBX11K is a benchmark known for its intra-class similarity and subtle inter-class differences, posing a challenge even for high-capacity networks. Our model achieves an impressive 93.85% accuracy and 92.74% F1 score, surpassing InceptionV3 (90.61%) and ViT (91.76%) while also outperforming DenseNet201 and EfficientNet-B0 by over 5%. These improvements suggest that our architecture captures minute variations in color, shape, and texture with greater sensitivity, likely due to our adaptive multi-scale feature encoding and targeted regularization. For example, the precision gain (92.47%) implies fewer false positives, crucial in distinguishing classes with highly similar patterns. On the DTD dataset, our method achieves the highest accuracy of 79.63%, surpassing InceptionV3 (77.12%) and ViT (76.54%). Transformer-based models, while effective in modeling global dependencies, often struggle on texture-centric datasets due to their lack of inherent inductive bias for capturing local spatial patterns. Unlike convolutional layers, self-attention modules do not emphasize localized feature hierarchies unless explicitly guided by architectural constraints. To address this limitation, our model introduces an enhanced cross-channel attention mechanism that adaptively reweights spatial features across both local and global contexts. This is further coupled with a fine-grained feature extractor, implemented as a shallow multi-scale convolutional block prior to attention fusion, which captures local edge patterns, repetitive textures, and fine structural motifs commonly seen in DTD samples. By combining global reasoning with explicit local encoding, our architecture becomes more sensitive to texture variations while maintaining discriminative capacity across categories. 
These design choices lead to superior performance in distinguishing subtle texture classes, as reflected in our F1 score and area under the receiver operating characteristic curve (AUROC) improvements over transformer-only baselines. The gains across precision (78.42%) and recall (80.15%) reflect improved semantic coherence in our representations, allowing the model to detect and differentiate between abstract attributes (like bubbly, striped, or zigzagged) more effectively. The performance superiority across different types of datasets&#x2014;large-scale, coarse-grained, fine-grained, and texture-focused&#x2014;confirms the generalization strength of our method and its ability to adapt to diverse visual recognition scenarios.</p>
<p>The consistent improvements across all benchmarks can be attributed to several critical design choices in our architecture and training methodology. First, the incorporation of hierarchical token refinement in our model enables progressive enrichment of features from both low-level and high-level semantics, essential for handling datasets with complex or fine-grained characteristics such as TBX11K and DTD. Second, our localized attention mechanism embedded within multi-scale layers enhances both spatial and semantic feature extraction, leading to improved context understanding and better boundary preservation. This becomes particularly evident on datasets like DIBaS and KAU-BCMD, where the visual variance is high. Third, our use of adaptive data augmentation and dynamic loss weighting allows the model to balance learning across majority and minority classes, which proves critical on DTD and KAU-BCMD, where class distributions are uneven. The training strategy with warm-up schedules, cosine decay, and label smoothing contributes to stable convergence and better generalization. Unlike models such as ViT and Swin-T, which may suffer from overfitting on smaller datasets or over-smoothing in deeper layers, our method incorporates dropout and layer-wise normalization that dynamically adjust with the learning state, preventing the loss of representational diversity. Notably, our model is able to leverage transformer-based advantages without sacrificing the benefits of convolutional locality, providing a hybrid architecture that is simultaneously expressive, regularized, and lightweight. These advantages, validated empirically, support the deployment of our method as a robust baseline for a wide range of visual tasks from general classification to attribute prediction. Once the results are considered in context, the effectiveness and adaptability of our method become unmistakable, positioning it as a reliable and scalable alternative to existing SOTA techniques.</p>
<p>To directly validate our framework&#x2019;s capability in real-world multi-omics scenarios, we conduct additional experiments using two benchmark datasets: TCGA-BRCA (histopathology + gene expression) and CPTAC-OV (histopathology + proteomics). These datasets represent distinct biological domains and omics modalities, enabling us to assess the model&#x2019;s generalizability across cancer types and molecular signals. We extract visual features from H&amp;E-stained whole-slide images using a ResNet50 backbone pretrained on ImageNet. For omics data, we select the top 500 most variable genes (TCGA-BRCA) or proteins (CPTAC-OV) after normalization and log transformation, and then we process them through dense encoding layers. We pass the fused features into our cross-modal attention module and dynamic graph fusion layer. We compare our model against four baselines: image only, omics only, early fusion, and late fusion. As shown in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>, our model consistently outperforms all baselines across both datasets. This demonstrates not only the effectiveness of the proposed cross-modal fusion architecture but also its robustness across omics modalities (gene vs. protein) and cancer types. The superior Area Under the ROC Curve (AUC) and F1 scores highlight the clinical relevance of the learned multi-modal representations and confirm our model&#x2019;s potential for real-world multi-omics precision diagnostics.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Performance comparison on two real multi-omics datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Method</th>
<th valign="middle" colspan="3" align="center">TCGA-BRCA (gene)</th>
<th valign="middle" colspan="3" align="center">CPTAC-OV (proteomics)</th>
</tr>
<tr>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">AUC</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Image only (ResNet50)</td>
<td valign="middle" align="center">78.21</td>
<td valign="middle" align="center">0.76</td>
<td valign="middle" align="center">0.80</td>
<td valign="middle" align="center">75.64</td>
<td valign="middle" align="center">0.73</td>
<td valign="middle" align="center">0.78</td>
</tr>
<tr>
<td valign="middle" align="left">Omics only (MLP)</td>
<td valign="middle" align="center">74.30</td>
<td valign="middle" align="center">0.72</td>
<td valign="middle" align="center">0.77</td>
<td valign="middle" align="center">72.40</td>
<td valign="middle" align="center">0.70</td>
<td valign="middle" align="center">0.75</td>
</tr>
<tr>
<td valign="middle" align="left">Early fusion</td>
<td valign="middle" align="center">81.12</td>
<td valign="middle" align="center">0.79</td>
<td valign="middle" align="center">0.83</td>
<td valign="middle" align="center">77.83</td>
<td valign="middle" align="center">0.75</td>
<td valign="middle" align="center">0.80</td>
</tr>
<tr>
<td valign="middle" align="left">Late fusion</td>
<td valign="middle" align="center">80.87</td>
<td valign="middle" align="center">0.78</td>
<td valign="middle" align="center">0.82</td>
<td valign="middle" align="center">77.10</td>
<td valign="middle" align="center">0.74</td>
<td valign="middle" align="center">0.79</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>Ours (cross-modal + graph)</bold></td>
<td valign="middle" align="center"><bold>85.46</bold></td>
<td valign="middle" align="center"><bold>0.83</bold></td>
<td valign="middle" align="center"><bold>0.87</bold></td>
<td valign="middle" align="center"><bold>82.31</bold></td>
<td valign="middle" align="center"><bold>0.80</bold></td>
<td valign="middle" align="center"><bold>0.85</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the numerical results of experimental indicators obtained by our method.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Recent developments in multi-modal fusion have introduced advanced architectures that surpass earlier models in integrating heterogeneous data sources. To ensure the competitiveness and contemporary relevance of our method, we select three recent and influential baselines for additional evaluation: TransMed, MMGL-Net, and MFFormer. These models represent state-of-the-art techniques in transformer-based, graph-based, and hierarchical fusion for medical and biological imaging tasks. TransMed uses a dual-stream transformer backbone to integrate imaging with non-visual clinical data, achieving promising performance on multi-modal datasets. MMGL-Net introduces a multi-modal graph learning mechanism to capture relationships across modalities, especially effective for biological networks. MFFormer applies a multi-scale transformer framework that dynamically aligns and fuses spatial and molecular representations across levels of abstraction. All three models are designed to solve the same core challenge as ours: unifying visual and biological data for improved classification. We conduct experiments on the DIBaS and KAU-BCMD datasets using the same preprocessing, metrics, and training configurations to ensure consistency. The results are reported in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>. On DIBaS, our method achieves the highest accuracy of 84.61%, exceeding MFFormer (83.24%), MMGL-Net (82.85%), and TransMed (81.72%). On KAU-BCMD, our model maintains its lead with 89.42% accuracy, compared to MFFormer (88.14%), MMGL-Net (87.63%), and TransMed (86.48%). These performance margins reflect the effectiveness of our cross-modal attention mechanism and dynamic graph-based fusion strategy in handling biological heterogeneity and spatial-molecular alignment. 
The superiority in both precision and recall indicates a balanced and robust classification capability, particularly valuable in clinical diagnostics where both false positives and false negatives carry a high cost. This expanded evaluation confirms the proposed method&#x2019;s advantage over current state-of-the-art architectures in multi-modal medical image classification.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparison of recent multi-modal fusion approaches on DIBaS and KAU-BCMD.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Model</th>
<th valign="middle" colspan="4" align="center">DIBaS dataset</th>
<th valign="middle" colspan="4" align="center">KAU-BCMD dataset</th>
</tr>
<tr>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">TransMed<break/><xref ref-type="bibr" rid="B11">Dai et&#xa0;al. (2021)</xref></td>
<td valign="middle" align="center">81.72</td>
<td valign="middle" align="center">80.91</td>
<td valign="middle" align="center">81.03</td>
<td valign="middle" align="center">80.97</td>
<td valign="middle" align="center">86.48</td>
<td valign="middle" align="center">85.02</td>
<td valign="middle" align="center">85.97</td>
<td valign="middle" align="center">85.49</td>
</tr>
<tr>
<td valign="middle" align="left">MMGL-Net<break/><xref ref-type="bibr" rid="B6">Bugatti et&#xa0;al. (2025a)</xref></td>
<td valign="middle" align="center">82.85</td>
<td valign="middle" align="center">81.40</td>
<td valign="middle" align="center">82.21</td>
<td valign="middle" align="center">81.80</td>
<td valign="middle" align="center">87.63</td>
<td valign="middle" align="center">86.22</td>
<td valign="middle" align="center">86.78</td>
<td valign="middle" align="center">86.50</td>
</tr>
<tr>
<td valign="middle" align="left">MFFormer<break/><xref ref-type="bibr" rid="B38">Roy et&#xa0;al. (2023a)</xref></td>
<td valign="middle" align="center">83.24</td>
<td valign="middle" align="center">82.15</td>
<td valign="middle" align="center">82.99</td>
<td valign="middle" align="center">82.57</td>
<td valign="middle" align="center">88.14</td>
<td valign="middle" align="center">87.28</td>
<td valign="middle" align="center">87.89</td>
<td valign="middle" align="center">87.58</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>84.61</bold></td>
<td valign="middle" align="center"><bold>83.72</bold></td>
<td valign="middle" align="center"><bold>84.93</bold></td>
<td valign="middle" align="center"><bold>84.32</bold></td>
<td valign="middle" align="center"><bold>89.42</bold></td>
<td valign="middle" align="center"><bold>88.33</bold></td>
<td valign="middle" align="center"><bold>89.75</bold></td>
<td valign="middle" align="center"><bold>89.03</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the numerical results of experimental indicators obtained by our method.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Ablation study</title>
<p>To validate the effectiveness of each core component in our model architecture, we conduct ablation studies by progressively removing or disabling specific modules. As shown in <xref ref-type="table" rid="T5"><bold>Tables&#xa0;5</bold></xref>, <xref ref-type="table" rid="T6"><bold>6</bold></xref>, we evaluate the performance of our model on four representative datasets, including DIBaS, KAU-BCMD, TBX11K, and DTD. The ablated configurations are the model without latent temporal representation, without immune adaptation modeling, and without trajectory-based posterior formulation. Across all datasets, we observe that each component contributes positively to the model&#x2019;s final performance. The full model (ours) consistently achieves the highest scores on accuracy, precision, recall, and F1 score. On DIBaS, the accuracy improves from 81.29% (without latent temporal representation) to 84.61% when all modules are present, a clear indication that multi-scale fusion plays a foundational role in capturing both global and local semantic cues. Likewise, on KAU-BCMD, we see a significant leap from 86.37% (without latent temporal representation) to 89.42%, which demonstrates the utility of fused features in handling diverse object categories with varying appearance and scale. The removal of hierarchical attention (without immune adaptation modeling) also causes consistent drops across all metrics, underscoring the role of dynamic feature emphasis in refining relevant regions and suppressing noise. For example, in TBX11K, precision drops from 92.47% to 89.93% and F1 score from 92.74% to 90.46%, suggesting that hierarchical attention is particularly critical for fine-grained feature discrimination. These results collectively confirm that each architectural element is indispensable and that their joint optimization brings compound gains rather than redundant overlaps.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Ablation study results on DIBaS and KAU-BCMD datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Model</th>
<th valign="middle" colspan="4" align="left">DIBaS dataset</th>
<th valign="middle" colspan="4" align="left">KAU-BCMD dataset</th>
</tr>
<tr>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">Precision</th>
<th valign="middle" align="left">Recall</th>
<th valign="middle" align="left">F1 score</th>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">Precision</th>
<th valign="middle" align="left">Recall</th>
<th valign="middle" align="left">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Without latent temporal representation</td>
<td valign="middle" align="left">81.29 &#xb1; 0.02</td>
<td valign="middle" align="left">80.14 &#xb1; 0.03</td>
<td valign="middle" align="left">79.02 &#xb1; 0.02</td>
<td valign="middle" align="left">79.57 &#xb1; 0.03</td>
<td valign="middle" align="left">86.37 &#xb1; 0.03</td>
<td valign="middle" align="left">84.98 &#xb1; 0.02</td>
<td valign="middle" align="left">85.25 &#xb1; 0.02</td>
<td valign="middle" align="left">85.11 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="left">Without immune adaptation modeling</td>
<td valign="middle" align="left">82.44 &#xb1; 0.03</td>
<td valign="middle" align="left">81.70 &#xb1; 0.02</td>
<td valign="middle" align="left">80.21 &#xb1; 0.02</td>
<td valign="middle" align="left">80.95 &#xb1; 0.02</td>
<td valign="middle" align="left">87.22 &#xb1; 0.02</td>
<td valign="middle" align="left">86.35 &#xb1; 0.02</td>
<td valign="middle" align="left">85.68 &#xb1; 0.03</td>
<td valign="middle" align="left">86.01 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="left">Without trajectory-based posterior formulation</td>
<td valign="middle" align="left">83.50 &#xb1; 0.02</td>
<td valign="middle" align="left">82.10 &#xb1; 0.02</td>
<td valign="middle" align="left">83.20 &#xb1; 0.03</td>
<td valign="middle" align="left">82.65 &#xb1; 0.02</td>
<td valign="middle" align="left">88.12 &#xb1; 0.03</td>
<td valign="middle" align="left">87.44 &#xb1; 0.02</td>
<td valign="middle" align="left">88.13 &#xb1; 0.02</td>
<td valign="middle" align="left">87.78 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>Ours</bold></td>
<td valign="middle" align="left"><bold>84.61 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>83.72 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>84.93 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>84.32 &#xb1; 0.03</bold></td>
<td valign="middle" align="left"><bold>89.42 &#xb1; 0.03</bold></td>
<td valign="middle" align="left"><bold>88.33 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>89.75 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>89.03 &#xb1; 0.02</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the results obtained by our full model, with no modules removed.</p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Ablation study results on TBX11K and DTD datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Model</th>
<th valign="middle" colspan="4" align="left">TBX11K dataset</th>
<th valign="middle" colspan="4" align="left">DTD dataset</th>
</tr>
<tr>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">Precision</th>
<th valign="middle" align="left">Recall</th>
<th valign="middle" align="left">F1 score</th>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">Precision</th>
<th valign="middle" align="left">Recall</th>
<th valign="middle" align="left">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Without latent temporal representation</td>
<td valign="middle" align="left">90.41 &#xb1; 0.03</td>
<td valign="middle" align="left">89.26 &#xb1; 0.02</td>
<td valign="middle" align="left">88.80 &#xb1; 0.03</td>
<td valign="middle" align="left">89.03 &#xb1; 0.02</td>
<td valign="middle" align="left">76.70 &#xb1; 0.02</td>
<td valign="middle" align="left">75.33 &#xb1; 0.02</td>
<td valign="middle" align="left">77.50 &#xb1; 0.02</td>
<td valign="middle" align="left">76.40 &#xb1; 0.03</td>
</tr>
<tr>
<td valign="middle" align="left">Without immune adaptation modeling</td>
<td valign="middle" align="left">91.22 &#xb1; 0.02</td>
<td valign="middle" align="left">89.93 &#xb1; 0.03</td>
<td valign="middle" align="left">91.01 &#xb1; 0.02</td>
<td valign="middle" align="left">90.46 &#xb1; 0.02</td>
<td valign="middle" align="left">77.95 &#xb1; 0.03</td>
<td valign="middle" align="left">77.81 &#xb1; 0.02</td>
<td valign="middle" align="left">78.14 &#xb1; 0.02</td>
<td valign="middle" align="left">77.97 &#xb1; 0.02</td>
</tr>
<tr>
<td valign="middle" align="left">Without trajectory-based posterior formulation</td>
<td valign="middle" align="left">92.46 &#xb1; 0.03</td>
<td valign="middle" align="left">91.08 &#xb1; 0.02</td>
<td valign="middle" align="left">91.77 &#xb1; 0.03</td>
<td valign="middle" align="left">91.42 &#xb1; 0.02</td>
<td valign="middle" align="left">78.85 &#xb1; 0.03</td>
<td valign="middle" align="left">77.92 &#xb1; 0.02</td>
<td valign="middle" align="left">79.34 &#xb1; 0.02</td>
<td valign="middle" align="left">78.62 &#xb1; 0.03</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>Ours</bold></td>
<td valign="middle" align="left"><bold>93.85 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>92.47 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>93.01 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>92.74 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>79.63 &#xb1; 0.03</bold></td>
<td valign="middle" align="left"><bold>78.42 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>80.15 &#xb1; 0.02</bold></td>
<td valign="middle" align="left"><bold>79.28 &#xb1; 0.02</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the results obtained by our full model, with no modules removed.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Furthermore, the adaptive token distillation mechanism (the trajectory-based posterior formulation component) shows particularly strong contributions in high-density semantic tasks such as DTD and TBX11K. Removing this module (without trajectory-based posterior formulation) leads to a drop in F1 score from 92.74% to 91.42% on TBX11K and from 79.28% to 78.62% on DTD. These metrics reveal the benefit of progressive token compression and contextual enrichment, which are critical in preserving long-range dependencies without sacrificing local sensitivity. On DTD, which relies on human-perceived texture descriptions, this component helps disambiguate visual textures like striped and woven by retaining mid-level attributes through aggregated semantic tokens. The recall improvement from 77.50% (without latent temporal representation) to 80.15% (ours) further confirms the ability of the complete model to capture visually ambiguous instances more effectively. Meanwhile, the KAU-BCMD results indicate that removing any single module consistently reduces performance, especially on complex classes with high visual variance, such as tools, instruments, or animals. This proves that the synergistic effect between fusion, attention, and distillation is essential for generalization. Interestingly, even when only one component is removed, performance deterioration can be as large as 3.3% in recall or 3.2% in accuracy, highlighting that the performance gain from our method is not solely due to any isolated enhancement but the thoughtful integration of each design choice.</p>
<p>The ablation results strongly validate the modular design philosophy of our architecture. Multi-scale feature fusion (latent temporal representation) is essential for rich contextual aggregation across receptive fields. Hierarchical attention refinement (immune adaptation modeling) enhances semantic saliency and suppresses distractors across the spatial hierarchy. Adaptive token distillation (trajectory-based posterior formulation) improves compact representation and scalability across vision tasks. The experimental outcomes across all four benchmarks confirm that each module not only contributes individually but also amplifies the efficacy of the others when integrated holistically. This modular synergy is what allows our model to surpass conventional CNNs and transformers, which often struggle to maintain a trade-off between local detail preservation and global context modeling. Therefore, the ablation study not only quantifies the contribution of each component but also highlights their interdependence, establishing a clear justification for the architecture of our full model.</p>
<p>To further assess the diagnostic reliability of our model under real-world uncertainty and visual variability, we conduct AUROC-based evaluations on the TBX11K and DTD datasets. These datasets present significant challenges due to their diverse image textures and subtle morphological differences. We compare our approach with three representative baselines&#x2014;ResNet50, ViT, and InceptionV3&#x2014;by computing the receiver operating characteristic (ROC) curves and corresponding AUROC values. As shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, our method consistently achieves higher AUROC scores on both datasets, indicating improved discriminative capacity in distinguishing complex infection-related patterns. Specifically, on the TBX11K dataset, our model achieves an AUROC of 0.69, outperforming InceptionV3 (0.65), ViT (0.62), and ResNet50 (0.59). On the DTD dataset, our model reaches an AUROC of 0.61, compared to 0.57 (InceptionV3), 0.56 (ViT), and 0.53 (ResNet50). These improvements suggest that the proposed visual&#x2013;omics fusion framework offers greater robustness in settings where disease phenotypes exhibit subtle or overlapping visual cues.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Receiver operating characteristic (ROC) curves comparing classification performance on the TBX11K (left) and DTD (right) datasets. Our model consistently outperforms three widely used vision baselines&#x2014;ResNet50, ViT, and InceptionV3&#x2014;in terms of area under the receiver operating characteristic curve (AUROC). These results demonstrate improved class separability and robustness under complex visual and biological noise.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fcimb-15-1616189-g005.tif">
<alt-text content-type="machine-generated">Two ROC curve graphs compare the performance of models on TBX11K and DTD datasets. The models shown are ResNet50, ViT, InceptionV3, and another model labeled &#x201c;Ours.&#x201d; The Area Under the Receiver Operating Characteristic (AUROC) values are higher on TBX11K, with &#x201c;Ours&#x201d; achieving the best performance (AUROC = 0.69) compared to DTD (AUROC = 0.61). A dashed line indicates random performance.</alt-text>
</graphic></fig>
<p>To provide additional transparency regarding the training dynamics of our baseline models, we visualize the training loss and accuracy curves for both ResNet50 and ViT architectures. As shown in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>, we train ResNet for 100 epochs with an initial learning rate of 0.01, while we train ViT for 300 epochs with a learning rate of 3e&#x2212;4. Both models use cosine annealing for learning rate decay. The training loss consistently decreases while training accuracy increases across epochs, indicating stable convergence. For ResNet, the final training accuracy reaches approximately 96%, with a final loss of approximately 0.21. For ViT, we observe smoother convergence with a final accuracy approaching 99% and a loss below 0.05 after 300 epochs. These results validate that both models are sufficiently optimized under the training schedule, and the observed performance differences in evaluation are not due to underfitting.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Training dynamics of ResNet (left, 100 epochs) and ViT (right, 300 epochs). Both models use cosine annealing for learning rate decay. The loss steadily decreases while training accuracy improves throughout the training process, indicating stable convergence under the selected learning rates.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fcimb-15-1616189-g006.tif">
<alt-text content-type="machine-generated">Two line graphs compare training loss and accuracy over epochs for ResNet and ViT models. The ResNet graph, covering 100 epochs, shows decreasing loss and increasing accuracy. The ViT graph, covering 300 epochs, demonstrates a similar pattern with extended epochs. Both graphs highlight model performance metrics using red for loss and blue for accuracy.</alt-text>
</graphic></fig>
<p>To provide a more comprehensive evaluation beyond classification accuracy, we further report the precision, recall, F1 score, and training loss for our model and baseline methods across all four datasets. This allows for better assessment of the models&#x2019; sensitivity, specificity, and robustness under various imaging conditions. As shown in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>, our model consistently outperforms other architectures across all evaluation metrics, achieving notably higher F1 scores and lower training loss. On the DIBaS and KAU-BCMD datasets, our method surpasses ResNet50, ViT, and Swin-T by a significant margin, with F1 scores exceeding 0.90. On the more challenging TBX11K and DTD datasets, our approach maintains a balanced precision-recall profile, achieving the highest F1 score and the lowest loss among all baselines. These results demonstrate that our model not only performs better in overall accuracy but also produces more stable and reliable predictions across diverse microbial and histological image domains.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Performance comparison of our model and three baselines on four datasets using precision, recall, F1 score, and training loss.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Model</th>
<th valign="middle" colspan="4" align="center">DIBaS</th>
<th valign="middle" colspan="4" align="center">KAU-BCMD</th>
</tr>
<tr>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Loss</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Loss</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">ResNet50</td>
<td valign="middle" align="center">0.77</td>
<td valign="middle" align="center">0.78</td>
<td valign="middle" align="center">0.77</td>
<td valign="middle" align="center">0.42</td>
<td valign="middle" align="center">0.83</td>
<td valign="middle" align="center">0.81</td>
<td valign="middle" align="center">0.82</td>
<td valign="middle" align="center">0.36</td>
</tr>
<tr>
<td valign="middle" align="left">ViT</td>
<td valign="middle" align="center">0.80</td>
<td valign="middle" align="center">0.81</td>
<td valign="middle" align="center">0.80</td>
<td valign="middle" align="center">0.38</td>
<td valign="middle" align="center">0.86</td>
<td valign="middle" align="center">0.85</td>
<td valign="middle" align="center">0.85</td>
<td valign="middle" align="center">0.32</td>
</tr>
<tr>
<td valign="middle" align="left">Swin-T</td>
<td valign="middle" align="center">0.83</td>
<td valign="middle" align="center">0.84</td>
<td valign="middle" align="center">0.83</td>
<td valign="middle" align="center">0.34</td>
<td valign="middle" align="center">0.87</td>
<td valign="middle" align="center">0.88</td>
<td valign="middle" align="center">0.88</td>
<td valign="middle" align="center">0.29</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>0.85</bold></td>
<td valign="middle" align="center"><bold>0.84</bold></td>
<td valign="middle" align="center"><bold>0.84</bold></td>
<td valign="middle" align="center"><bold>0.28</bold></td>
<td valign="middle" align="center"><bold>0.90</bold></td>
<td valign="middle" align="center"><bold>0.91</bold></td>
<td valign="middle" align="center"><bold>0.90</bold></td>
<td valign="middle" align="center"><bold>0.21</bold></td>
</tr>
</tbody>
</table>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Model</th>
<th valign="middle" colspan="4" align="center">TBX11K</th>
<th valign="middle" colspan="4" align="center">DTD</th>
</tr>
<tr>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Loss</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">Loss</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">ResNet50</td>
<td valign="middle" align="center">0.60</td>
<td valign="middle" align="center">0.59</td>
<td valign="middle" align="center">0.59</td>
<td valign="middle" align="center">0.47</td>
<td valign="middle" align="center">0.52</td>
<td valign="middle" align="center">0.53</td>
<td valign="middle" align="center">0.52</td>
<td valign="middle" align="center">0.51</td>
</tr>
<tr>
<td valign="middle" align="left">ViT</td>
<td valign="middle" align="center">0.63</td>
<td valign="middle" align="center">0.62</td>
<td valign="middle" align="center">0.62</td>
<td valign="middle" align="center">0.43</td>
<td valign="middle" align="center">0.55</td>
<td valign="middle" align="center">0.56</td>
<td valign="middle" align="center">0.55</td>
<td valign="middle" align="center">0.48</td>
</tr>
<tr>
<td valign="middle" align="left">InceptionV3</td>
<td valign="middle" align="center">0.66</td>
<td valign="middle" align="center">0.65</td>
<td valign="middle" align="center">0.65</td>
<td valign="middle" align="center">0.41</td>
<td valign="middle" align="center">0.56</td>
<td valign="middle" align="center">0.57</td>
<td valign="middle" align="center">0.57</td>
<td valign="middle" align="center">0.46</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>0.70</bold></td>
<td valign="middle" align="center"><bold>0.69</bold></td>
<td valign="middle" align="center"><bold>0.69</bold></td>
<td valign="middle" align="center"><bold>0.36</bold></td>
<td valign="middle" align="center"><bold>0.61</bold></td>
<td valign="middle" align="center"><bold>0.61</bold></td>
<td valign="middle" align="center"><bold>0.61</bold></td>
<td valign="middle" align="center"><bold>0.39</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results, all of which were obtained by our method.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>To ensure a fair and comprehensive evaluation against classical convolutional architectures, we additionally compare our model with two widely adopted CNN baselines&#x2014;VGG19 and Xception. These models have been extensively used in medical image classification tasks and serve as strong benchmarks in texture- and morphology-driven domains. As shown in <xref ref-type="table" rid="T8"><bold>Table&#xa0;8</bold></xref>, our model consistently outperforms both VGG19 and Xception across all four datasets in terms of accuracy, F1 score, and AUROC. On the DIBaS and KAU-BCMD datasets, our method yields a notable margin of improvement, reflecting its superior ability to capture discriminative microbial image patterns. On more challenging datasets such as TBX11K and DTD, the hybrid attention and fine-grained feature encoding in our model offer stronger robustness to intra-class texture variations and spatial noise, compared to the fixed receptive field design of VGG-style networks. These results reinforce the versatility and generalizability of our framework across different biological and clinical imaging settings.</p>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Comparison of classification performance between our model and two widely used CNN baselines (VGG19 and Xception) on all four datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Dataset</th>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">AUROC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="3" align="left">DIBaS</td>
<td valign="middle" align="center">VGG19</td>
<td valign="middle" align="center">76.85</td>
<td valign="middle" align="center">0.76</td>
<td valign="middle" align="center">0.78</td>
</tr>
<tr>
<td valign="middle" align="center">Xception</td>
<td valign="middle" align="center">78.47</td>
<td valign="middle" align="center">0.78</td>
<td valign="middle" align="center">0.80</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>84.61</bold></td>
<td valign="middle" align="center"><bold>0.84</bold></td>
<td valign="middle" align="center"><bold>0.87</bold></td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="left">KAU-BCMD</td>
<td valign="middle" align="center">VGG19</td>
<td valign="middle" align="center">82.63</td>
<td valign="middle" align="center">0.82</td>
<td valign="middle" align="center">0.85</td>
</tr>
<tr>
<td valign="middle" align="center">Xception</td>
<td valign="middle" align="center">84.50</td>
<td valign="middle" align="center">0.84</td>
<td valign="middle" align="center">0.86</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>89.42</bold></td>
<td valign="middle" align="center"><bold>0.90</bold></td>
<td valign="middle" align="center"><bold>0.91</bold></td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="left">TBX11K</td>
<td valign="middle" align="center">VGG19</td>
<td valign="middle" align="center">72.18</td>
<td valign="middle" align="center">0.71</td>
<td valign="middle" align="center">0.74</td>
</tr>
<tr>
<td valign="middle" align="center">Xception</td>
<td valign="middle" align="center">73.56</td>
<td valign="middle" align="center">0.72</td>
<td valign="middle" align="center">0.76</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>79.63</bold></td>
<td valign="middle" align="center"><bold>0.79</bold></td>
<td valign="middle" align="center"><bold>0.83</bold></td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="left">DTD</td>
<td valign="middle" align="center">VGG19</td>
<td valign="middle" align="center">70.43</td>
<td valign="middle" align="center">0.70</td>
<td valign="middle" align="center">0.72</td>
</tr>
<tr>
<td valign="middle" align="center">Xception</td>
<td valign="middle" align="center">72.26</td>
<td valign="middle" align="center">0.72</td>
<td valign="middle" align="center">0.75</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Ours</bold></td>
<td valign="middle" align="center"><bold>79.63</bold></td>
<td valign="middle" align="center"><bold>0.79</bold></td>
<td valign="middle" align="center"><bold>0.81</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Our model consistently outperforms traditional CNNs in terms of accuracy, F1 score, and AUROC. Bold values indicate best results.</p></fn>
<fn>
<p>CNN, convolutional neural network; AUROC, area under the receiver operating characteristic curve.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s5" sec-type="discussion">
<label>5</label>
<title>Discussion</title>
<p>While our framework introduces multiple biologically inspired components&#x2014;including temporal modeling, immune adaptation, and environmental dynamics&#x2014;it may initially appear over-engineered for conventional image classification tasks. However, our motivation extends beyond static classification. Our goal is to build a diagnostic model that mirrors the layered and dynamic processes observed in real biological systems. In host&#x2013;pathogen interactions, the state of disease progression is defined not solely by visual features but also by gene expression profiles, environmental stressors, and immune memory shaped by prior infections. Traditional fusion methods fail to model these dependencies. Therefore, we introduce temporal latent states to represent the evolution of infection, allowing the model to simulate how molecular signals and visual patterns change over time. The immune adaptation gate mimics host-specific responses, dynamically adjusting the infection likelihood based on exposure history. Environmental dynamics modules enable context-sensitive reasoning across patient populations with different microbiological microenvironments. These mechanisms are particularly beneficial when dealing with incomplete or noisy data, which is common in clinical datasets. Moreover, our MESS strategy ensures that the latent representations conform to biological equilibrium conditions, allowing for more stable and interpretable diagnostic predictions. From a performance standpoint, our ablation studies demonstrate that removing any of these components leads to measurable drops in accuracy and F1 score, confirming their practical utility. In our real multi-omics experiments (TCGA-BRCA and CPTAC-OV), the full model consistently outperforms early/late fusion methods. This suggests that complexity is not arbitrary but functionally necessary to support robust, generalizable, and biologically plausible decision-making in multi-omics diagnosis.</p>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusions and future work</title>
<p>In this study, we set out to address the limitations of conventional image classification methods in disease diagnosis, particularly within the domain of microbe&#x2013;host interactions. Traditional approaches often rely on unimodal features, failing to account for the complex ecological and systemic contexts of pathogenesis. To resolve this, we develop PathoGenesisNet, a multi-omics-based dynamic latent-state model that integrates image data with various omics modalities such as metagenomics, spatial transcriptomics, and immunohistochemical imaging. The model captures pathogen evolution, host responses, and environmental factors in a unified framework. A core component, MESS, supports the inference of infection phenotypes by exploring biologically plausible equilibria in the host&#x2013;pathogen state space. Through the integration of symbolic dynamics and probabilistic graphical models, our framework achieves superior performance in both accuracy and interpretability, offering robust resistance to biological noise and heterogeneity across populations. Experiments on multi-modal datasets have confirmed its effectiveness, making it a strong candidate for real-time, precision-focused diagnostic applications.</p>
<p>Despite its promising results, our method has notable limitations. First, while PathoGenesisNet effectively handles a wide range of microbial interactions, its reliance on high-quality, multi-modal datasets limits scalability to low-resource clinical settings where such data may be sparse or partially missing. Second, the model&#x2019;s equilibrium-driven inference, although biologically meaningful, may introduce computational complexity that hinders deployment in time-critical scenarios. Future work will focus on improving the model&#x2019;s efficiency through lightweight, approximate inference techniques and enhancing its robustness to missing data using advanced imputation and self-supervised learning strategies. We plan to expand its applicability to other disease contexts, particularly those involving viral dynamics and chronic inflammation, thereby broadening its clinical utility and reinforcing the role of ecological awareness in precision diagnostics.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>YL: Conceptualization, Methodology, Writing &#x2013; original draft. SC: Software, Validation, Writing &#x2013; original draft. JC: Formal Analysis, Investigation, Writing &#x2013; original draft. MS: Data curation, Writing &#x2013; review &amp; editing, Writing &#x2013; original draft. YW: Visualization, Supervision, Funding acquisition, Writing &#x2013; original draft.</p></sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Arshad</surname> <given-names>Q. A.</given-names></name>
<name><surname>Ali</surname> <given-names>M.</given-names></name>
<name><surname>Hassan</surname> <given-names>S.-U.</given-names></name>
<name><surname>Chen</surname> <given-names>C.</given-names></name>
<name><surname>Imran</surname> <given-names>A.</given-names></name>
<name><surname>Rasul</surname> <given-names>G.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>A dataset and benchmark for malaria life-cycle classification in thin blood smear images</article-title>. <source>Neural Computing Appl.</source> <volume>34</volume>, <fpage>4473</fpage>&#x2013;<lpage>4485</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00521-021-06602-6</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ashtiani</surname> <given-names>F.</given-names></name>
<name><surname>Geers</surname> <given-names>A. J.</given-names></name>
<name><surname>Aflatouni</surname> <given-names>F.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>An on-chip photonic deep neural network for image classification</article-title>. <source>Nature</source> <volume>606</volume>, <fpage>501</fpage>&#x2013;<lpage>506</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41586-022-04714-0</pub-id>, PMID: <pub-id pub-id-type="pmid">35650432</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Azizi</surname> <given-names>S.</given-names></name>
<name><surname>Mustafa</surname> <given-names>B.</given-names></name>
<name><surname>Ryan</surname> <given-names>F.</given-names></name>
<name><surname>Beaver</surname> <given-names>Z.</given-names></name>
<name><surname>Freyberg</surname> <given-names>J.</given-names></name>
<name><surname>Deaton</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>Big self-supervised models advance medical image classification</article-title>,&#x201d; in <conf-name>IEEE International Conference on Computer Vision</conf-name>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content/ICCV2021/html/Azizi_Big_Self-Supervised_Models_Advance_Medical_Image_Classification_ICCV_2021_paper.html">https://openaccess.thecvf.com/content/ICCV2021/html/Azizi_Big_Self-Supervised_Models_Advance_Medical_Image_Classification_ICCV_2021_paper.html</uri>.
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bazi</surname> <given-names>Y.</given-names></name>
<name><surname>Bashmal</surname> <given-names>L.</given-names></name>
<name><surname>Rahhal</surname> <given-names>M. M. A.</given-names></name>
<name><surname>Dayil</surname> <given-names>R. A.</given-names></name>
<name><surname>Ajlan</surname> <given-names>N. A.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Vision transformers for remote sensing image classification</article-title>. <source>Remote Sens.</source> <volume>13</volume> (<issue>3</issue>), <elocation-id>516</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs13030516</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Bhojanapalli</surname> <given-names>S.</given-names></name>
<name><surname>Chakrabarti</surname> <given-names>A.</given-names></name>
<name><surname>Glasner</surname> <given-names>D.</given-names></name>
<name><surname>Li</surname> <given-names>D.</given-names></name>
<name><surname>Unterthiner</surname> <given-names>T.</given-names></name>
<name><surname>Veit</surname> <given-names>A.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Understanding robustness of transformers for image classification</article-title>,&#x201d; in <conf-name>IEEE International Conference on Computer Vision</conf-name>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content/ICCV2021/html/Bhojanapalli_Understanding_Robustness_of_Transformers_for_Image_Classification_ICCV_2021_paper.html">https://openaccess.thecvf.com/content/ICCV2021/html/Bhojanapalli_Understanding_Robustness_of_Transformers_for_Image_Classification_ICCV_2021_paper.html</uri>.
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bugatti</surname> <given-names>A.</given-names></name>
<name><surname>Zani</surname> <given-names>A.</given-names></name>
<name><surname>Bardelli</surname> <given-names>M.</given-names></name>
<name><surname>Giovanetti</surname> <given-names>M.</given-names></name>
<name><surname>Ravelli</surname> <given-names>C.</given-names></name>
<name><surname>Ciccozzi</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>a). 
<article-title>Heparan sulfate proteoglycans remodel sars-cov-2 spike conformation to allow integrin interaction and infection of endothelial cells</article-title>. <source>Front. Cell. Infection Microbiol.</source> <volume>15</volume>, <elocation-id>1552116</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fcimb.2025.1552116</pub-id>, PMID: <pub-id pub-id-type="pmid">40248367</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>C.-F.</given-names></name>
<name><surname>Fan</surname> <given-names>Q.</given-names></name>
<name><surname>Panda</surname> <given-names>R.</given-names></name>
</person-group> (<year>2021</year>a). &#x201c;
<article-title>Crossvit: Cross-attention multi-scale vision transformer for image classification</article-title>,&#x201d; in <conf-name>IEEE International Conference on Computer Vision</conf-name>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content/ICCV2021/html/Chen_CrossViT_Cross-Attention_Multi-Scale_Vision_Transformer_for_Image_Classification_ICCV_2021_paper.html">https://openaccess.thecvf.com/content/ICCV2021/html/Chen_CrossViT_Cross-Attention_Multi-Scale_Vision_Transformer_for_Image_Classification_ICCV_2021_paper.html</uri>.
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>L.</given-names></name>
<name><surname>Li</surname> <given-names>S.</given-names></name>
<name><surname>Bai</surname> <given-names>Q.</given-names></name>
<name><surname>Yang</surname> <given-names>J.</given-names></name>
<name><surname>Jiang</surname> <given-names>S.</given-names></name>
<name><surname>Miao</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2021</year>b). 
<article-title>Review of image classification algorithms based on convolutional neural networks</article-title>. <source>Remote Sens.</source> <volume>13</volume>, <elocation-id>4712</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs13224712</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Chhabra</surname> <given-names>M.</given-names></name>
<name><surname>Kumar</surname> <given-names>R.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>A smart healthcare system based on classifier densenet 121 model to detect multiple diseases</article-title>,&#x201d; in <source>Mobile radio communications and 5G networks: proceedings of second MRCN 2021</source> (
<publisher-name>Springer</publisher-name>), <fpage>297</fpage>&#x2013;<lpage>312</lpage>. Available online at: <uri xlink:href="https://link.springer.com/chapter/10.1007/978-981-16-7018-3_23">https://link.springer.com/chapter/10.1007/978-981-16-7018-3_23</uri>.
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dai</surname> <given-names>Y.</given-names></name>
<name><surname>Gao</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Transmed: Transformers advance multi-modal medical image classification</article-title>. <source>Diagnostics</source> <volume>11</volume> (<issue>8</issue>), <elocation-id>1384</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/diagnostics11081384</pub-id>, PMID: <pub-id pub-id-type="pmid">34441318</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dai</surname> <given-names>Y.</given-names></name>
<name><surname>Gao</surname> <given-names>Y.</given-names></name>
<name><surname>Liu</surname> <given-names>F.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Transmed: Transformers advance multi-modal medical image classification</article-title>. <source>Diagnostics</source> <volume>11</volume>, <fpage>1384</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/diagnostics11081384</pub-id>, PMID: <pub-id pub-id-type="pmid">34441318</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dong</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Zou</surname> <given-names>B.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Exploring vision transformers for polarimetric sar image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>1</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2021.3137383</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Elpeltagy</surname> <given-names>M.</given-names></name>
<name><surname>Sallam</surname> <given-names>H.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Automatic prediction of covid-19 from chest images using modified resnet50</article-title>. <source>Multimedia Tools Appl.</source> <volume>80</volume>, <fpage>26451</fpage>&#x2013;<lpage>26463</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-021-10783-6</pub-id>, PMID: <pub-id pub-id-type="pmid">33967592</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>GP</surname> <given-names>C. R.</given-names></name>
<name><surname>Reddy</surname> <given-names>P. A.</given-names></name>
<name><surname>Kanabur</surname> <given-names>V. R.</given-names></name>
<name><surname>Vijayasenan</surname> <given-names>D.</given-names></name>
<name><surname>Govindan</surname> <given-names>S.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;
<article-title>Semi-automatic labeling and semantic segmentation of gram-stained microscopic images from dibas dataset</article-title>,&#x201d; in <conf-name>2023 2nd International Conference on Computational Systems and Communication (ICCSC)</conf-name>. <fpage>1</fpage>&#x2013;<lpage>6</lpage>. Available online at: <uri xlink:href="https://ieeexplore.ieee.org/abstract/document/10142976/">https://ieeexplore.ieee.org/abstract/document/10142976/</uri>.
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>F.</given-names></name>
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Ning</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Beautydiffusion: Generative latent decomposition for makeup transfer via diffusion models</article-title>. <source>Inf. Fusion</source> <elocation-id>103241</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.inffus.2025.103241</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hong</surname> <given-names>D.</given-names></name>
<name><surname>Gao</surname> <given-names>L.</given-names></name>
<name><surname>Yao</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>B.</given-names></name>
<name><surname>Plaza</surname> <given-names>A.</given-names></name>
<name><surname>Chanussot</surname> <given-names>J.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Graph convolutional networks for hyperspectral image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>59</volume> (<issue>7</issue>), <fpage>5966</fpage>&#x2013;<lpage>5978</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2020.3015157</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hong</surname> <given-names>D.</given-names></name>
<name><surname>Han</surname> <given-names>Z.</given-names></name>
<name><surname>Yao</surname> <given-names>J.</given-names></name>
<name><surname>Gao</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>B.</given-names></name>
<name><surname>Plaza</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). 
<article-title>Spectralformer: Rethinking hyperspectral image classification with transformers</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2021.3130716</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Kautish</surname> <given-names>S.</given-names></name>
<name><surname>Peng</surname> <given-names>S.-L.</given-names></name>
<name><surname>Obaid</surname> <given-names>A. J.</given-names></name>
</person-group> (<year>2021</year>). <source>Computational intelligence techniques for combating COVID-19</source> (
<publisher-name>Springer</publisher-name>).
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kim</surname> <given-names>H. E.</given-names></name>
<name><surname>Cosa-Linan</surname> <given-names>A.</given-names></name>
<name><surname>Santhanam</surname> <given-names>N.</given-names></name>
<name><surname>Jannesari</surname> <given-names>M.</given-names></name>
<name><surname>Maros</surname> <given-names>M.</given-names></name>
<name><surname>Ganslandt</surname> <given-names>T.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Transfer learning for medical image classification: a literature review</article-title>. <source>BMC Med. Imaging</source> <volume>22</volume>, <fpage>69</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12880-022-00793-7</pub-id>, PMID: <pub-id pub-id-type="pmid">35418051</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Koonce</surname> <given-names>B.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Mobilenetv3</article-title>,&#x201d; in <source>Convolutional neural networks with swift for tensorflow: image recognition and dataset categorization</source> (
<publisher-name>Springer</publisher-name>), <fpage>125</fpage>&#x2013;<lpage>144</lpage>.
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lahiri</surname> <given-names>C.</given-names></name>
<name><surname>Mandal</surname> <given-names>S.</given-names></name>
<name><surname>Ghosh</surname> <given-names>W.</given-names></name>
<name><surname>Dam</surname> <given-names>B.</given-names></name>
<name><surname>Roy</surname> <given-names>P.</given-names></name>
</person-group> (<year>2006</year>). 
<article-title>A novel gene cluster soxSRT is essential for the chemolithotrophic oxidation of thiosulfate and tetrathionate by Pseudaminobacter salicylatoxidans KCT001</article-title>. <source>Curr. Microbiol.</source> <volume>52</volume>, <fpage>267</fpage>&#x2013;<lpage>273</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00284-005-0176-x</pub-id>, PMID: <pub-id pub-id-type="pmid">16528465</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lahiri</surname> <given-names>C.</given-names></name>
<name><surname>Pawar</surname> <given-names>S.</given-names></name>
<name><surname>Mishra</surname> <given-names>R.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Precision medicine and future of cancer treatment</article-title>. <source>Precis. Cancer Med</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.21037/pcm.2019.09.01</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lahiri</surname> <given-names>C.</given-names></name>
<name><surname>Shrikant</surname> <given-names>P.</given-names></name>
<name><surname>Sabarinathan</surname> <given-names>R.</given-names></name>
<name><surname>Ashraf</surname> <given-names>M. I.</given-names></name>
<name><surname>Chakravortty</surname> <given-names>D.</given-names></name>
</person-group> (<year>2012</year>). 
<article-title>Identifying indispensable proteins of the type III secretion systems of Salmonella enterica serovar Typhimurium strain LT2</article-title>. <source>BMC Bioinf.</source> <volume>13</volume>, <fpage>1</fpage>&#x2013;<lpage>2</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/1471-2105-13-S12-A10</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>B.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Eliceiri</surname> <given-names>K.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Dual-stream multiple instance learning network for whole slide image classification with self-supervised contrastive learning</article-title>. <source>Comput. Vision Pattern Recognition</source>, <fpage>14318</fpage>&#x2013;<lpage>14328</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01409</pub-id>, PMID: <pub-id pub-id-type="pmid">35047230</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>Wu</surname> <given-names>Y.-H.</given-names></name>
<name><surname>Ban</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Cheng</surname> <given-names>M.-M.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Rethinking computer-aided tuberculosis diagnosis</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <fpage>2646</fpage>&#x2013;<lpage>2655</lpage>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content_CVPR_2020/html/Liu_Rethinking_Computer-Aided_Tuberculosis_Diagnosis_CVPR_2020_paper.html">https://openaccess.thecvf.com/content_CVPR_2020/html/Liu_Rethinking_Computer-Aided_Tuberculosis_Diagnosis_CVPR_2020_paper.html</uri>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Lin</surname> <given-names>Y.</given-names></name>
<name><surname>Cao</surname> <given-names>Y.</given-names></name>
<name><surname>Hu</surname> <given-names>H.</given-names></name>
<name><surname>Wei</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper">https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper</uri>.
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lu</surname> <given-names>X.</given-names></name>
<name><surname>Lu</surname> <given-names>Q.</given-names></name>
<name><surname>Zhu</surname> <given-names>R.</given-names></name>
<name><surname>Sun</surname> <given-names>M.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>Ge</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Metagenomic analysis reveals the diversity of the vaginal virome and its association with vaginitis</article-title>. <source>Front. Cell. Infection Microbiol.</source> <volume>15</volume>, <elocation-id>1582553</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fcimb.2025.1582553</pub-id>, PMID: <pub-id pub-id-type="pmid">40248366</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mai</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>R.</given-names></name>
<name><surname>Jeong</surname> <given-names>J.</given-names></name>
<name><surname>Quispe</surname> <given-names>D.</given-names></name>
<name><surname>Kim</surname> <given-names>H. J.</given-names></name>
<name><surname>Sanner</surname> <given-names>S.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Online continual learning in image classification: An empirical survey</article-title>. <source>Neurocomputing</source>.
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Masana</surname> <given-names>M.</given-names></name>
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Twardowski</surname> <given-names>B.</given-names></name>
<name><surname>Menta</surname> <given-names>M.</given-names></name>
<name><surname>Bagdanov</surname> <given-names>A. D.</given-names></name>
<name><surname>van de Weijer</surname> <given-names>J.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Class-incremental learning: Survey and performance evaluation on image classification</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source>, PMID: <pub-id pub-id-type="pmid">36215375</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Mascarenhas</surname> <given-names>S.</given-names></name>
<name><surname>Agarwal</surname> <given-names>M.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>A comparison between vgg16, vgg19 and resnet50 architecture frameworks for image classification</article-title>,&#x201d; in <conf-name>2021 International Conference on Disruptive Technologies for Multi-Disciplinary Research and Applications (CENTCON)</conf-name>. Available online at: <uri xlink:href="https://ieeexplore.ieee.org/abstract/document/9687944">https://ieeexplore.ieee.org/abstract/document/9687944</uri>.
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Maur&#xed;cio</surname> <given-names>J.</given-names></name>
<name><surname>Domingues</surname> <given-names>I.</given-names></name>
<name><surname>Bernardino</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Comparing vision transformers and convolutional neural networks for image classification: A literature review</article-title>. <source>Appl. Sci</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app13095521</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Nadkarni</surname> <given-names>S.</given-names></name>
<name><surname>Noronha</surname> <given-names>K.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>Classification of mammographic images using convolutional neural networks</article-title>,&#x201d; in <conf-name>2023 IEEE Engineering Informatics</conf-name>. <fpage>01</fpage>&#x2013;<lpage>05</lpage>. Available online at: <uri xlink:href="https://ieeexplore.ieee.org/abstract/document/10520562">https://ieeexplore.ieee.org/abstract/document/10520562</uri>.
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Nguyen</surname> <given-names>H.</given-names></name>
<name><surname>Shrestha</surname> <given-names>S.</given-names></name>
<name><surname>Draghici</surname> <given-names>S.</given-names></name>
<name><surname>Nguyen</surname> <given-names>T.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Pinsplus: a tool for tumor subtype discovery in integrated genomic data</article-title>. <source>Bioinformatics</source> <volume>35</volume>, <fpage>2843</fpage>&#x2013;<lpage>2846</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/bty1049</pub-id>, PMID: <pub-id pub-id-type="pmid">30590381</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Peng</surname> <given-names>J.</given-names></name>
<name><surname>Huang</surname> <given-names>Y.</given-names></name>
<name><surname>Sun</surname> <given-names>W.</given-names></name>
<name><surname>Chen</surname> <given-names>N.</given-names></name>
<name><surname>Ning</surname> <given-names>Y.</given-names></name>
<name><surname>Du</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Domain adaptation in remote sensing image classification: A survey</article-title>. <source>IEEE J. Selected Topics Appl. Earth Observations Remote Sens</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2022.3220875</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ramazzotti</surname> <given-names>D.</given-names></name>
<name><surname>Lal</surname> <given-names>A.</given-names></name>
<name><surname>Wang</surname> <given-names>B.</given-names></name>
<name><surname>Batzoglou</surname> <given-names>S.</given-names></name>
<name><surname>Sidow</surname> <given-names>A.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Multi-omic tumor data reveal diversity of molecular mechanisms that correlate with survival</article-title>. <source>Nat. Commun.</source> <volume>9</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41467-018-06921-8</pub-id>, PMID: <pub-id pub-id-type="pmid">30367051</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rao</surname> <given-names>Y.</given-names></name>
<name><surname>Zhao</surname> <given-names>W.</given-names></name>
<name><surname>Zhu</surname> <given-names>Z.</given-names></name>
<name><surname>Lu</surname> <given-names>J.</given-names></name>
<name><surname>Zhou</surname> <given-names>J.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Global filter networks for image classification</article-title>. <source>Neural Inf. Process. Syst</source>.
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Roy</surname> <given-names>S. K.</given-names></name>
<name><surname>Deria</surname> <given-names>A.</given-names></name>
<name><surname>Hong</surname> <given-names>D.</given-names></name>
<name><surname>Rasti</surname> <given-names>B.</given-names></name>
<name><surname>Plaza</surname> <given-names>A.</given-names></name>
<name><surname>Chanussot</surname> <given-names>J.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Multimodal fusion transformer for remote sensing image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens</source>.
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Roy</surname> <given-names>S. K.</given-names></name>
<name><surname>Deria</surname> <given-names>A.</given-names></name>
<name><surname>Hong</surname> <given-names>D.</given-names></name>
<name><surname>Rasti</surname> <given-names>B.</given-names></name>
<name><surname>Plaza</surname> <given-names>A.</given-names></name>
<name><surname>Chanussot</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>a). 
<article-title>Multimodal fusion transformer for remote sensing image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>61</volume>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2023.3286826</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shakyawar</surname> <given-names>S. K.</given-names></name>
<name><surname>Sajja</surname> <given-names>B. R.</given-names></name>
<name><surname>Patel</surname> <given-names>J. C.</given-names></name>
<name><surname>Guda</surname> <given-names>C.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>iCluF: an unsupervised iterative cluster-fusion method for patient stratification using multiomics data</article-title>. <source>Bioinf. Adv.</source> <volume>4</volume>, <fpage>vbae015</fpage>, PMID: <pub-id pub-id-type="pmid">38698887</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Shakyawar</surname> <given-names>S. K.</given-names></name>
<name><surname>Sethi</surname> <given-names>S.</given-names></name>
<name><surname>Southekal</surname> <given-names>S.</given-names></name>
<name><surname>Mishra</surname> <given-names>N. K.</given-names></name>
<name><surname>Guda</surname> <given-names>C.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Big data analytics for modeling COVID-19 and comorbidities: An unmet need</article-title>,&#x201d; in <source>Computational Intelligence Techniques for Combating COVID-19</source> (
<publisher-name>Springer</publisher-name>), <fpage>185</fpage>&#x2013;<lpage>201</lpage>. Available online at: <uri xlink:href="https://link.springer.com/chapter/10.1007/978-3-030-68936-0_10">https://link.springer.com/chapter/10.1007/978-3-030-68936-0_10</uri>.
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>L.</given-names></name>
<name><surname>Zhao</surname> <given-names>G.</given-names></name>
<name><surname>Zheng</surname> <given-names>Y.</given-names></name>
<name><surname>Wu</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Spectral&#x2013;spatial feature tokenization transformer for hyperspectral image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2022.3144158</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Taori</surname> <given-names>R.</given-names></name>
<name><surname>Dave</surname> <given-names>A.</given-names></name>
<name><surname>Shankar</surname> <given-names>V.</given-names></name>
<name><surname>Carlini</surname> <given-names>N.</given-names></name>
<name><surname>Recht</surname> <given-names>B.</given-names></name>
<name><surname>Schmidt</surname> <given-names>L.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Measuring robustness to natural distribution shifts in image classification</article-title>. <source>Neural Inf. Process. Syst.</source> doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2007.00644</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tian</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Krishnan</surname> <given-names>D.</given-names></name>
<name><surname>Tenenbaum</surname> <given-names>J.</given-names></name>
<name><surname>Isola</surname> <given-names>P.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Rethinking few-shot image classification: a good embedding is all you need?</article-title> <source>Eur. Conf. Comput. Vision</source>, <fpage>266</fpage>&#x2013;<lpage>282</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-58568-6_16</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Touvron</surname> <given-names>H.</given-names></name>
<name><surname>Bojanowski</surname> <given-names>P.</given-names></name>
<name><surname>Caron</surname> <given-names>M.</given-names></name>
<name><surname>Cord</surname> <given-names>M.</given-names></name>
<name><surname>El-Nouby</surname> <given-names>A.</given-names></name>
<name><surname>Grave</surname> <given-names>E.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). 
<article-title>Resmlp: Feedforward networks for image classification with data-efficient training</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume> (<issue>4</issue>), <fpage>5314</fpage>&#x2013;<lpage>5321</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2022.3206148</pub-id>, PMID: <pub-id pub-id-type="pmid">36094972</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Yang</surname> <given-names>S.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>M.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Yang</surname> <given-names>W.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Transformer-based unsupervised contrastive learning for histopathological image classification</article-title>. <source>Med. Image Anal.</source> <volume>81</volume>, <elocation-id>102559</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2022.102559</pub-id>, PMID: <pub-id pub-id-type="pmid">35952419</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Woo</surname> <given-names>S.</given-names></name>
<name><surname>Debnath</surname> <given-names>S.</given-names></name>
<name><surname>Hu</surname> <given-names>R.</given-names></name>
<name><surname>Chen</surname> <given-names>X.</given-names></name>
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Kweon</surname> <given-names>I. S.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>a). &#x201c;
<article-title>Convnext v2: Co-designing and scaling convnets with masked autoencoders</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <fpage>16133</fpage>&#x2013;<lpage>16142</lpage>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content/CVPR2023/html/Woo_ConvNeXt_V2_Co-Designing_and_Scaling_ConvNets_With_Masked_Autoencoders_CVPR_2023_paper.html">https://openaccess.thecvf.com/content/CVPR2023/html/Woo_ConvNeXt_V2_Co-Designing_and_Scaling_ConvNets_With_Masked_Autoencoders_CVPR_2023_paper.html</uri>.
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>H.</given-names></name>
<name><surname>Chen</surname> <given-names>R.</given-names></name>
<name><surname>Li</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2021</year>a). 
<article-title>Subtype-gan: a deep learning approach for integrative cancer subtyping of multi-omics data</article-title>. <source>Bioinformatics</source> <volume>37</volume>, <fpage>2231</fpage>&#x2013;<lpage>2237</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btab109</pub-id>, PMID: <pub-id pub-id-type="pmid">33599254</pub-id>
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>J.</given-names></name>
<name><surname>Shi</surname> <given-names>R.</given-names></name>
<name><surname>Wei</surname> <given-names>D.</given-names></name>
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Zhao</surname> <given-names>L.</given-names></name>
<name><surname>Ke</surname> <given-names>B.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>b). 
<article-title>Medmnist v2 - a large-scale lightweight benchmark for 2d and 3d biomedical image classification</article-title>. <source>Sci. Data</source> <volume>10</volume>, <fpage>41</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41597-022-01721-8</pub-id>, PMID: <pub-id pub-id-type="pmid">36658144</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yuan</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>T.</given-names></name>
<name><surname>Yu</surname> <given-names>W.</given-names></name>
<name><surname>Shi</surname> <given-names>Y.</given-names></name>
<name><surname>Jiang</surname> <given-names>Z.-H.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>Tokens-to-token vit: Training vision transformers from scratch on imagenet</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>558</fpage>&#x2013;<lpage>567</lpage>. Available online at: <uri xlink:href="https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_Tokens-to-Token_ViT_Training_Vision_Transformers_From_Scratch_on_ImageNet_ICCV_2021_paper.html">https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_Tokens-to-Token_ViT_Training_Vision_Transformers_From_Scratch_on_ImageNet_ICCV_2021_paper.html</uri>.
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>C.</given-names></name>
<name><surname>Cai</surname> <given-names>Y.</given-names></name>
<name><surname>Lin</surname> <given-names>G.</given-names></name>
<name><surname>Shen</surname> <given-names>C.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Deepemd: Few-shot image classification with differentiable earth mover&#x2019;s distance and structured classifiers</article-title>. <source>Comput. Vision Pattern Recognition</source>, <fpage>12200</fpage>&#x2013;<lpage>12210</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR42600.2020</pub-id>
</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>Z.</given-names></name>
<name><surname>Hu</surname> <given-names>S.</given-names></name>
<name><surname>Liu</surname> <given-names>H.</given-names></name>
<name><surname>Lai</surname> <given-names>F.</given-names></name>
<name><surname>Fan</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>a). 
<article-title>Assessment of the placental microbiota of preterm infants with pneumonia: a case control study</article-title>. <source>Front. Cell. Infection Microbiol.</source> <volume>15</volume>, <elocation-id>1511141</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fcimb.2025.1511141</pub-id>, PMID: <pub-id pub-id-type="pmid">40248365</pub-id>
</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>W.</given-names></name>
<name><surname>Sun</surname> <given-names>W.</given-names></name>
<name><surname>Tao</surname> <given-names>R.</given-names></name>
<name><surname>Du</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Single-source domain expansion network for cross-scene hyperspectral image classification</article-title>. <source>IEEE Trans. Image Process.</source> <volume>32</volume>, <fpage>1498</fpage>&#x2013;<lpage>1512</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIP.2023.3243853</pub-id>, PMID: <pub-id pub-id-type="pmid">37027628</pub-id>
</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>X.</given-names></name>
<name><surname>Sun</surname> <given-names>H.</given-names></name>
<name><surname>Lu</surname> <given-names>X.</given-names></name>
<name><surname>Xie</surname> <given-names>W.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Rotation-invariant attention network for hyperspectral image classification</article-title>. <source>IEEE Trans. Image Process.</source> <volume>31</volume>, <fpage>4251</fpage>&#x2013;<lpage>4265</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIP.2022.3177322</pub-id>, PMID: <pub-id pub-id-type="pmid">35635815</pub-id>
</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhuang</surname> <given-names>F.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Ke</surname> <given-names>G.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Bian</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Deep subdomain adaptation network for image classification</article-title>. <source>IEEE Trans. Neural Networks Learn. Syst.</source> <volume>32</volume> (<issue>4</issue>), <fpage>1713</fpage>&#x2013;<lpage>1722</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TNNLS.2020.2988928</pub-id>, PMID: <pub-id pub-id-type="pmid">32365037</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/534029">Chandrajit Lahiri</ext-link>, Atmiya University, India</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1367770">Sushil Shakyawar</ext-link>, University of Nebraska Medical Center, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3045723">Shrikant Pawar</ext-link>, Claflin University, United States</p></fn>
</fn-group>
</back>
</article>