<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2026.1739716</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>MedCSS: a causal self-supervised approach for hierarchical feature consistency in 3D medical imaging</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Han</surname> <given-names>Jiang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x02020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Wang</surname> <given-names>Fei</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x02020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<uri xlink:href="https://loop.frontiersin.org/people/3277216"/>
</contrib>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name><surname>Shen</surname> <given-names>Xingchen</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x02020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/2824554"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Cao</surname> <given-names>Feng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/989799"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Institute of Geriatrics, Beijing Key Laboratory of Aging and Geriatrics, National Clinical Research Center for Geriatric Diseases, Second Medical Center of Chinese PLA General Hospital</institution>, <city>Beijing</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Institute of Software, Chinese Academy of Sciences</institution>, <city>Beijing</city>, <country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>University of Chinese Academy of Sciences</institution>, <city>Beijing</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Xingchen Shen, <email xlink:href="mailto:xingchen@iscas.ac.cn">xingchen@iscas.ac.cn</email>; Feng Cao, <email xlink:href="mailto:fengcao8828@163.com">fengcao8828@163.com</email></corresp>
<fn fn-type="equal" id="fn001"><label>&#x02020;</label><p>These authors have contributed equally to this work</p></fn></author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-16">
<day>16</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>20</volume>
<elocation-id>1739716</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>16</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Han, Wang, Shen and Cao.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Han, Wang, Shen and Cao</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-16">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Medical image analysis plays a crucial role in linking perceptual mechanisms with clinical diagnosis, yet conventional deep learning models often rely on statistical correlations rather than modeling the underlying generative structure, leading to limited robustness in small-sample and cross-domain scenarios. To address this issue, we propose a hierarchical feature consistency framework named &#x0201C;MedCSS&#x0201D; that integrates causal self-supervised learning. Built upon a 3D ResNet backbone, the method aligns intermediate and high-level features through distributional consistency while introducing a coding rate&#x02013;based causal regularization to suppress non-causal redundancy. Experiments on the MedMNIST3D benchmark demonstrate enhanced feature stability, boundary sensitivity, and generalization across diverse medical structures. Visualization analyses further reveal improved morphological coherence and causal interpretability. This study highlights the potential of causal self-supervision for structurally robust and semantically consistent representation learning in three-dimensional medical imaging.</p></abstract>
<kwd-group>
<kwd>causal self-supervision</kwd>
<kwd>deep learning</kwd>
<kwd>feature consistency</kwd>
<kwd>medical image analysis</kwd>
<kwd>representation learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Beijing Natural Science Foundation (No. L241026) and the National Natural Science Foundation of China (No. 42506186).</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="5"/>
<equation-count count="11"/>
<ref-count count="27"/>
<page-count count="13"/>
<word-count count="6143"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Neuroscience Methods and Techniques</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Medical image analysis represents a crucial interdisciplinary field at the intersection of biomedical science and artificial intelligence research. Its core objectives extend beyond assisting clinical disease detection and diagnosis to encompass understanding how biological systems encode, transmit, and integrate perceptual information across multiple scales and modalities (<xref ref-type="bibr" rid="B17">Litjens et al., 2017</xref>; <xref ref-type="bibr" rid="B1">Anaya-Isaza et al., 2021</xref>). The morphological and dynamic changes in imaging signals reveal tissue structures and functional states, providing an objective foundation for exploring the neural mechanisms underlying perception, attention, and cognitive processing (<xref ref-type="bibr" rid="B8">Greenspan et al., 2016</xref>; <xref ref-type="bibr" rid="B25">Tajbakhsh et al., 2020</xref>). The widespread application of artificial intelligence technologies, particularly deep learning, enables models to automatically learn underlying patterns from large-scale medical images. This has driven the convergent development of intelligent diagnostic systems and perception modeling.</p>
<p>The introduction of deep learning has shifted medical image analysis from manual feature extraction to end-to-end representation learning (<xref ref-type="bibr" rid="B15">LeCun et al., 2015</xref>; <xref ref-type="bibr" rid="B23">Schmidhuber, 2015</xref>). Models based on convolutional neural networks (CNNs) have demonstrated outstanding performance in tasks such as organ segmentation, lesion detection, and image reconstruction (<xref ref-type="bibr" rid="B21">Ronneberger et al., 2015</xref>; <xref ref-type="bibr" rid="B6">&#x000C7;i&#x000E7;ek et al., 2016</xref>; <xref ref-type="bibr" rid="B14">Kamnitsas et al., 2017</xref>). However, these approaches also exhibit inherent limitations: First, they heavily rely on large-scale manually annotated data, which is costly to produce and prone to observer bias (<xref ref-type="bibr" rid="B7">Esteva et al., 2021</xref>); Second, models primarily establish representations through correlation learning, lacking characterization of the data generation process and structural causal mechanisms; Finally, significant distribution differences across modalities and devices substantially impact model generalization performance (<xref ref-type="bibr" rid="B11">Huang and Chung, 2022</xref>). In medical video-based dynamic lesion monitoring tasks, existing models struggle to handle image degradation caused by adverse conditions such as noise, rain, and snow, failing to consistently deliver accurate diagnostic information (<xref ref-type="bibr" rid="B18">Liu J. et al., 2025</xref>). In scenarios involving dense lesion prediction&#x02014;such as brain tumors and polyps&#x02014;traditional hierarchical networks suffer from scale information loss and feature misalignment, directly reducing lesion boundary recognition accuracy and impacting clinical intervention decisions (<xref ref-type="bibr" rid="B19">Liu X. et al., 2025</xref>). 
Therefore, how to obtain representations with causal consistency and structural interpretability under limited annotation conditions represents a core scientific challenge currently facing medical artificial intelligence.</p>
<p>To alleviate reliance on labeled data, self-supervised learning has gradually emerged as a key method for medical image representation learning (<xref ref-type="bibr" rid="B3">Caron et al., 2020</xref>; <xref ref-type="bibr" rid="B5">Chen et al., 2020</xref>; <xref ref-type="bibr" rid="B9">Grill et al., 2020</xref>). Its fundamental approach involves designing proxy tasks&#x02014;such as contrastive learning, image reconstruction, and rotation prediction&#x02014;to extract high-level abstract features from unlabeled samples, thereby enhancing feature transferability and data utilization efficiency (<xref ref-type="bibr" rid="B9">Grill et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Azizi et al., 2021</xref>). Against this research backdrop, the MedMNIST series of datasets (<xref ref-type="bibr" rid="B27">Yang et al., 2023</xref>) provides a standardized evaluation platform covering multimodal 2D and 3D tasks, offering a unified benchmark for comparing the performance of different self-supervised and semi-supervised models. However, traditional self-supervised models often capture only statistical commonalities without modeling the underlying generative causal relationships behind observations.</p>
<p>In recent years, the concept of causal learning has been introduced into self-supervised frameworks, giving rise to a new direction known as Causal Self-Supervised Learning (Causal SSL). This approach introduces causal consistency constraints, enabling models to not only learn feature correlations but also focus on generative dependencies and intervention invariance among different variables (<xref ref-type="bibr" rid="B24">Sch&#x000F6;lkopf et al., 2021</xref>). Counterfactual reasoning can effectively mitigate biases, such as the Counterfactual Bidirectional Co-Attention Transformer framework reducing multimodal integration bias through counterfactual scenarios (<xref ref-type="bibr" rid="B12">Ji et al., 2025</xref>); <xref ref-type="bibr" rid="B13">Jones et al. (2024)</xref>&#x00027;s three-step framework also provides methods for bias identification and fair model development. The Undoing Memorization Mechanism (UMM) framework proposed by <xref ref-type="bibr" rid="B20">Qiang et al. (2025)</xref> reveals the memorization bias in self-supervised models and achieves causal stabilization of representations through hierarchical feature distribution alignment and coding rate constraints. Building upon this, the SEMI-CAVA model further embeds causal inference mechanisms into variational semi-supervised learning, enabling the identification of latent causal factors through latent variable intervention (<xref ref-type="bibr" rid="B22">Saha et al., 2025</xref>). These studies offer new perspectives on understanding structural representations in models, demonstrating the significant potential of causal constraints to enhance model generalization, interpretability, and cross-domain stability.</p>
<p>At the model architecture level, Residual Networks (ResNet) (<xref ref-type="bibr" rid="B10">He et al., 2016</xref>) have become a common backbone for learning representations in 3D medical images due to their efficient gradient propagation and feature reuse capabilities (<xref ref-type="bibr" rid="B4">Chen et al., 2021</xref>; <xref ref-type="bibr" rid="B16">Lee et al., 2025</xref>). ResNet variants of varying depths can capture multi-level features ranging from local textures to global semantics, providing a solid structural foundation for achieving cross-layer causal consistency learning. Building upon this, this paper proposes a hierarchical feature consistency learning framework based on a causal self-supervised mechanism. Using 3D ResNet as the backbone, it achieves cross-layer causal consistency through feature distribution alignment and coding rate regularization. Its effectiveness is validated on the MedMNIST3D dataset for nodule, vessel, and neural synapse classification tasks. This research not only demonstrates the feasibility of causal self-supervised learning in medical image analysis but also offers a novel computational perspective for understanding causal structure learning in artificial perception systems. The main contributions of this paper include:</p>
<list list-type="bullet">
<list-item><p>Proposing a 3D medical image classification framework that integrates causal self-supervision with hierarchical feature constraints, enabling the learning of causally consistent representations from unlabeled data;</p></list-item>
<list-item><p>Systematically validating its robustness and cross-domain generalization performance across multi-modal structural tasks using the MedMNIST3D dataset family;</p></list-item>
<list-item><p>Revealing the role of causal self-supervision in enhancing structural understanding and information compression from a perceptual cognition perspective, providing theoretical support for modeling biological perception mechanisms in artificial intelligence.</p></list-item>
</list></sec>
<sec id="s2">
<label>2</label>
<title>Methods</title>
<sec>
<label>2.1</label>
<title>Overview</title>
<p>Our study proposes a 3D Structural Representation Framework named &#x0201C;MedCSS&#x0201D; based on the Unsupervised Multi-task Matching (UMM) framework, aiming to simultaneously achieve robust feature learning across tissues and causally consistent modeling under limited annotation conditions. This approach targets diverse anatomical and pathological structures&#x02014;including pulmonary nodules, vascular branches, and neural synapses&#x02014;enabling the network to learn transferable 3D representations across varying tissue scales and imaging modalities. The overall architecture is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Overall model framework diagram of MedCSS.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1739716-g0001.tif">
<alt-text content-type="machine-generated">Flowchart displaying a 3D data enhancement pipeline with flipping and rotating, input into a 3D ResNet model. Layers one and two are frozen, while mid and final features proceed through a classification output layer. Coding rate loss and supervisory loss are combined for total loss calculation. An inset diagram details an internal block, showing a sequence including residual connection, convolution layers, global average pooling, two fully connected layers, ReLU, sigmoid, scaling, and output addition.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>2.2</label>
<title>Data preprocessing and augmentation</title>
<p>To enhance structural robustness and generalization, a dual-domain augmentation strategy was employed on each 3D image volume <italic>x</italic>&#x02208;&#x0211D;<sup>1 &#x000D7; <italic>D</italic>&#x000D7;<italic>H</italic>&#x000D7;<italic>W</italic></sup>. A stochastic transformation operator <inline-formula><mml:math id="M1"><mml:mrow><mml:mrow><mml:mi mathvariant="script">T</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> introduces random spatial and photometric perturbations while preserving structural topology:</p>
<disp-formula id="EQ1"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="script">T</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where <inline-formula><mml:math id="M3"><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> represents an augmented view used for self-supervised consistency learning. This unsupervised signal encourages the model to capture invariant structural semantics across spatial orientations.</p>
<p>During validation and testing, only linear intensity normalization is applied to ensure reproducible evaluation. To address severe class imbalance typical in medical datasets (e.g., between lesion and non-lesion regions or vessel types), we introduce a weighted sampling scheme, where the sampling probability for each sample <italic>i</italic> is defined as:</p>
<disp-formula id="EQ2"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn><mml:mo>/</mml:mo><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mn>1</mml:mn><mml:mo>/</mml:mo><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>with <italic>n</italic><sub><italic>c</italic><sub><italic>i</italic></sub></sub> denoting the number of samples in class <italic>c</italic><sub><italic>i</italic></sub>. This mechanism ensures balanced representation across both dominant and rare structural classes.</p>
</sec>
<sec>
<label>2.3</label>
<title>Network architecture</title>
<p>The backbone network adopts an enhanced 3D ResNet architecture as the feature encoder, augmented with Squeeze-and-Excitation (SE) channel attention to adaptively reweight discriminative channels.</p>
<p>The network begins with a 7 &#x000D7; 7 &#x000D7; 7 convolutional layer followed by four hierarchical residual stages. Layers 1&#x02013;2 capture low-level spatial and textural features, whereas layers 3&#x02013;4 encode high-level semantic representations.</p>
<p>We denote the intermediate and final feature maps as:</p>
<disp-formula id="EQ3"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Layer3</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Layer4</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>where <italic>f</italic><sub>3</sub> captures mid-level morphological patterns and <italic>f</italic><sub>4</sub> aggregates semantic information at a global scale. The final prediction output is given by:</p>
<disp-formula id="EQ4"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x00177;</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>where &#x003C3;(&#x000B7;) denotes the activation function (Sigmoid for binary, Softmax for multi-class tasks). This layered hierarchy allows the model to generalize to multiple 3D biomedical tasks, such as vessel subtype recognition or synaptic boundary detection.</p>
</sec>
<sec>
<label>2.4</label>
<title>Self-supervised feature alignment</title>
<p>Inspired by the Unsupervised Multi-task Matching (UMM) framework, we introduce a cross-level feature alignment loss that enforces statistical coherence between intermediate and final representations. After global average pooling (GAP), both feature maps are projected into a shared latent space:</p>
<disp-formula id="EQ5"><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">GAP</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">GAP</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<disp-formula id="EQ6"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003D5;</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003D5;</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>where &#x003D5;<sub>3</sub>(&#x000B7;) and &#x003D5;<sub>4</sub>(&#x000B7;) denote nonlinear projection heads. The alignment loss is computed as the Jensen&#x02013;Shannon divergence (JS) between the empirical distributions of <italic>h</italic><sub>3</sub> and <italic>h</italic><sub>4</sub>:</p>
<disp-formula id="EQ7"><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">align</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">JS</mml:mtext><mml:mstyle mathsize="1.19em"><mml:mrow><mml:mo>(</mml:mo></mml:mrow></mml:mstyle><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathsize="1.19em"><mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>This constraint encourages hierarchical consistency and invariance to data augmentations, allowing the model to capture structure-invariant semantics without reliance on explicit labels.</p>
</sec>
<sec>
<label>2.5</label>
<title>Causal regularization via coding rate reduction</title>
<p>To incorporate causal priors, we introduce a Coding Rate Reduction (CRR) constraint that simulates information compression along the feature hierarchy, reflecting the causal transmission from morphological cues to semantic abstractions.</p>
<p>Given feature matrix <italic>f</italic>&#x02208;&#x0211D;<sup><italic>B</italic>&#x000D7;<italic>d</italic></sup> with covariance matrix:</p>
<disp-formula id="EQ8"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003A3;</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:mfrac><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mo>-</mml:mo><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mo>-</mml:mo><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where <inline-formula><mml:math id="M11"><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> is the batch-wise mean, the coding rate is defined as:</p>
<disp-formula id="EQ9"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>R</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo class="qopname">log</mml:mo><mml:mo class="qopname">det</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>I</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:msub><mml:mrow><mml:mi>&#x003A3;</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>with &#x003B1; controlling sensitivity to feature variance. Ideally, high-level features should exhibit lower redundancy, i.e., <italic>R</italic>(<italic>f</italic><sub>4</sub>) &#x0003C; <italic>R</italic>(<italic>f</italic><sub>3</sub>). Thus, the CRR loss is formulated as:</p>
<disp-formula id="EQ10"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">crr</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">ReLU</mml:mtext><mml:mstyle mathsize="1.19em"><mml:mrow><mml:mo>(</mml:mo></mml:mrow></mml:mstyle><mml:mi>R</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>R</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathsize="1.19em"><mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>This regularization penalizes information inflation in deeper layers, promoting compact, causally consistent feature representations. In vascular and neuronal imaging tasks, such constraint mitigates spurious correlations and enhances interpretability of learned representations.</p>
</sec>
<sec>
<label>2.6</label>
<title>Joint optimization objective</title>
<p>The overall training objective integrates supervised, self-supervised, and causal components:</p>
<disp-formula id="EQ11"><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">sup</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">align</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">align</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">crr</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">crr</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>where <inline-formula><mml:math id="M15"><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">sup</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> denotes the supervised loss (e.g., weighted binary cross-entropy or Dice loss), and &#x003BB;<sub>align</sub>, &#x003BB;<sub>crr</sub> are balancing coefficients for the self-supervised and causal regularization terms, respectively.</p>
<p>This joint optimization strategy achieves collaborative feature learning across multiple tasks and cross-layer semantic unification by integrating supervised objectives, self-supervised distribution consistency, and causal regularization constraints. This significantly enhances the model&#x00027;s robustness and transferability in analyzing complex medical structures.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Results</title>
<sec>
<label>3.1</label>
<title>Overall classification performance</title>
<p>To systematically evaluate the effectiveness of our proposed method, we conducted experiments on three representative 3D medical image datasets: SynapseMNIST3D, VesselMNIST3D, and NoduleMNIST3D. Using ResNet10, ResNet18, and ResNet50 as backbone networks respectively, we compared the performance of the original models with that of our method (denoted as MedCSS) under identical training settings. Evaluation metrics included accuracy, recall, F1 score, and area under the ROC curve (AUC), with results shown in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Performance comparison of ResNet variants on three 3D datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center" colspan="4">SynapseMNIST3D</th>
<th valign="top" align="center" colspan="4">VesselMNIST3D</th>
<th valign="top" align="center" colspan="4">NoduleMNIST3D</th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ResNet10</td>
<td valign="top" align="center">0.7301</td>
<td valign="top" align="center"><bold>1.0000</bold></td>
<td valign="top" align="center">0.8440</td>
<td valign="top" align="center">0.7357</td>
<td valign="top" align="center">0.8927</td>
<td valign="top" align="center">0.3721</td>
<td valign="top" align="center">0.4384</td>
<td valign="top" align="center">0.6654</td>
<td valign="top" align="center">0.8129</td>
<td valign="top" align="center">0.6562</td>
<td valign="top" align="center">0.5915</td>
<td valign="top" align="center">0.7550</td>
</tr>
<tr>
<td valign="top" align="left">ResNet10&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.7528</bold></td>
<td valign="top" align="center">0.9494</td>
<td valign="top" align="center"><bold>0.8487</bold></td>
<td valign="top" align="center"><bold>0.7357</bold></td>
<td valign="top" align="center"><bold>0.9215</bold></td>
<td valign="top" align="center"><bold>0.6047</bold></td>
<td valign="top" align="center"><bold>0.6341</bold></td>
<td valign="top" align="center"><bold>0.9079</bold></td>
<td valign="top" align="center"><bold>0.8323</bold></td>
<td valign="top" align="center"><bold>0.7031</bold></td>
<td valign="top" align="center"><bold>0.6338</bold></td>
<td valign="top" align="center"><bold>0.8377</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet18</td>
<td valign="top" align="center">0.7358</td>
<td valign="top" align="center"><bold>0.9767</bold></td>
<td valign="top" align="center"><bold>0.8437</bold></td>
<td valign="top" align="center">0.5304</td>
<td valign="top" align="center">0.9241</td>
<td valign="top" align="center">0.5349</td>
<td valign="top" align="center">0.6133</td>
<td valign="top" align="center">0.7542</td>
<td valign="top" align="center">0.8226</td>
<td valign="top" align="center">0.4688</td>
<td valign="top" align="center">0.5217</td>
<td valign="top" align="center">0.6917</td>
</tr>
<tr>
<td valign="top" align="left">ResNet18&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.7386</bold></td>
<td valign="top" align="center">0.9572</td>
<td valign="top" align="center">0.8425</td>
<td valign="top" align="center"><bold>0.7085</bold></td>
<td valign="top" align="center"><bold>0.9372</bold></td>
<td valign="top" align="center"><bold>0.7209</bold></td>
<td valign="top" align="center"><bold>0.7209</bold></td>
<td valign="top" align="center"><bold>0.9483</bold></td>
<td valign="top" align="center"><bold>0.8806</bold></td>
<td valign="top" align="center"><bold>0.5938</bold></td>
<td valign="top" align="center"><bold>0.6726</bold></td>
<td valign="top" align="center"><bold>0.9170</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">0.7386</td>
<td valign="top" align="center">0.8794</td>
<td valign="top" align="center">0.8309</td>
<td valign="top" align="center">0.6186</td>
<td valign="top" align="center">0.9241</td>
<td valign="top" align="center">0.5581</td>
<td valign="top" align="center">0.6234</td>
<td valign="top" align="center">0.7643</td>
<td valign="top" align="center">0.8581</td>
<td valign="top" align="center">0.6562</td>
<td valign="top" align="center">0.6562</td>
<td valign="top" align="center">0.7834</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.7528</bold></td>
<td valign="top" align="center"><bold>0.9689</bold></td>
<td valign="top" align="center"><bold>0.8498</bold></td>
<td valign="top" align="center"><bold>0.7069</bold></td>
<td valign="top" align="center"><bold>0.9319</bold></td>
<td valign="top" align="center"><bold>0.6279</bold></td>
<td valign="top" align="center"><bold>0.6750</bold></td>
<td valign="top" align="center"><bold>0.9090</bold></td>
<td valign="top" align="center"><bold>0.8677</bold></td>
<td valign="top" align="center"><bold>0.6719</bold></td>
<td valign="top" align="center"><bold>0.6772</bold></td>
<td valign="top" align="center"><bold>0.8694</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values denote the experimental data with the best performance in each column.</p>
</table-wrap-foot>
</table-wrap>
<p>Overall, integrating our method consistently improved performance across all ResNet variants on the three datasets, with the most significant gains observed in ROC AUC. Taking VesselMNIST3D as an example, ResNet18&#x00027;s AUC increased from 0.7542 to 0.9483, while ResNet50&#x00027;s AUC rose from 0.7643 to 0.9090. On NoduleMNIST3D, ResNet18&#x0002B;MedCSS achieved an AUC of 0.9170. This substantial AUC improvement demonstrates the model&#x00027;s enhanced classification stability and positive/negative sample discrimination capability across varying thresholds. Furthermore, the concurrent increases in accuracy, recall, and F1 score indicate that our approach not only elevates overall classification precision but also effectively improves category balance and false positive rate control. For instance, on VesselMNIST3D, ResNet10&#x0002B;MedCSS increased recall from 0.3721 to 0.6047 and F1 score from 0.4384 to 0.6341. This simultaneous enhancement of accuracy and recall highlights the method&#x00027;s significant advantage in addressing the imbalanced positive-negative sample distribution in medical imaging.</p>
</sec>
<sec>
<label>3.2</label>
<title>Prediction probability distribution characteristics</title>
<p>To further validate the essence of model performance improvement at the probability space level, we conducted statistical and visual analysis of the prediction probability distributions output by each model (with a threshold set at 0.5). The results are shown in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Prediction probability distributions of baseline and our models on three 3D medical datasets. <bold>(a)</bold> ResNet50 in SynapseMNIST3D. <bold>(b)</bold> ResNet18 in SynapseMNIST3D. <bold>(c)</bold> ResNet10 in SynapseMNIST3D. <bold>(d)</bold> ResNet50&#x0002B;MedCSS in SynapseMNIST3D. <bold>(e)</bold> ResNet18&#x0002B;MedCSS in SynapseMNIST3D. <bold>(f)</bold> ResNet10&#x0002B;MedCSS in SynapseMNIST3D. <bold>(g)</bold> ResNet50 in VesselMNIST3D. <bold>(h)</bold> ResNet18 in VesselMNIST3D. <bold>(i)</bold> ResNet10 in VesselMNIST3D. <bold>(j)</bold> ResNet50&#x0002B;MedCSS in VesselMNIST3D. <bold>(k)</bold> ResNet18&#x0002B;MedCSS in VesselMNIST3D. <bold>(l)</bold> ResNet10&#x0002B;MedCSS in VesselMNIST3D. <bold>(m)</bold> ResNet50 in NoduleMNIST3D. <bold>(n)</bold> ResNet18 in NoduleMNIST3D. <bold>(o)</bold> ResNet10 in NoduleMNIST3D. <bold>(p)</bold> ResNet50&#x0002B;MedCSS in NoduleMNIST3D. <bold>(q)</bold> ResNet18&#x0002B;MedCSS in NoduleMNIST3D. <bold>(r)</bold> ResNet10&#x0002B;MedCSS in NoduleMNIST3D.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1739716-g0002.tif">
<alt-text content-type="machine-generated">Sixteen violin plots in a four-by-four grid show predicted probability distributions for negative and positive classes across three datasets: SynapseMNIST3D, VesselMNIST3D, and NoduleMNIST3D. Rows compare ResNet models with and without MedCSS. Plots illustrate class predictions, threshold lines at zero point five, and individual data points for each configuration.</alt-text>
</graphic>
</fig>
<p>Comparing the original ResNet architecture with the architecture incorporating our method, a significant distribution reconstruction phenomenon can be observed: After incorporating our method, negative samples (green distribution) exhibit dense probabilities in the low-value range, far from the 0.5 threshold, while positive samples (orange distribution) concentrate in the high-value range. This bimodal distribution formation reflects a substantial enhancement in model confidence.</p>
<p>Taking the SynapseMNIST3D dataset as an example, the original ResNet50 predictions showed substantial overlap in the 0.4&#x02013;0.6 probability range. After incorporating our method, the probability distribution boundaries of positive and negative samples became distinctly separated, with model output confidence improving by approximately 15%. Similar patterns hold true for VesselMNIST3D and NoduleMNIST3D, indicating that this method enables the network to learn more discriminative feature spaces, making prediction outputs better aligned with the true distribution characteristics of the samples.</p>
</sec>
<sec>
<label>3.3</label>
<title>ROC curves and threshold sensitivity analysis</title>
<p>To comprehensively characterize the classification performance of models at different thresholds, we plotted the ROC curves for each model, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>ROC curves of baseline and our models on three 3D medical datasets. <bold>(a)</bold> ResNet50 in SynapseMNIST3D. <bold>(b)</bold> ResNet18 in SynapseMNIST3D. <bold>(c)</bold> ResNet10 in SynapseMNIST3D. <bold>(d)</bold> ResNet50&#x0002B;MedCSS in SynapseMNIST3D. <bold>(e)</bold> ResNet18&#x0002B;MedCSS in SynapseMNIST3D. <bold>(f)</bold> ResNet10&#x0002B;MedCSS in SynapseMNIST3D. <bold>(g)</bold> ResNet50 in VesselMNIST3D. <bold>(h)</bold> ResNet18 in VesselMNIST3D. <bold>(i)</bold> ResNet10 in VesselMNIST3D. <bold>(j)</bold> ResNet50&#x0002B;MedCSS in VesselMNIST3D. <bold>(k)</bold> ResNet18&#x0002B;MedCSS in VesselMNIST3D. <bold>(l)</bold> ResNet10&#x0002B;MedCSS in VesselMNIST3D. <bold>(m)</bold> ResNet50 in NoduleMNIST3D. <bold>(n)</bold> ResNet18 in NoduleMNIST3D. <bold>(o)</bold> ResNet10 in NoduleMNIST3D. <bold>(p)</bold> ResNet50&#x0002B;MedCSS in NoduleMNIST3D. <bold>(q)</bold> ResNet18&#x0002B;MedCSS in NoduleMNIST3D. <bold>(r)</bold> ResNet10&#x0002B;MedCSS in NoduleMNIST3D.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1739716-g0003.tif">
<alt-text content-type="machine-generated">Grid of eighteen ROC curve graphs compares ResNet50, ResNet18, and ResNet10 models, with and without MedCSS, across SynapseMNIST3D, VesselMNIST3D, and NoduleMNIST3D datasets. Curves visualize model performance, with corresponding AUC values indicated in each subplot.</alt-text>
</graphic>
</fig>
<p>Overall, the curves of models incorporating our method cluster closer to the upper-left corner, with significantly increased AUC values. This indicates that the models maintain lower false positive rates under high recall conditions. Taking ResNet50 on NoduleMNIST3D as an example, the original model&#x00027;s ROC curve exhibits a noticeable gap relative to the ideal curve (upper left corner). After incorporating our method, this gap narrows by approximately 35%, corresponding to an AUC improvement of about 0.25. This demonstrates that the proposed method maintains stable classification discrimination across different threshold conditions.</p>
</sec>
<sec>
<label>3.4</label>
<title>Cross-dataset transfer performance</title>
<p>To evaluate the generalization capability of our method across different domains, we conducted cross-dataset transfer experiments, with results shown in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Cross-dataset accuracy performance when models are trained on different datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><bold>Model</bold></th>
<th valign="top" align="center"><bold>VesselMNIST3D</bold></th>
<th valign="top" align="center"><bold>NoduleMNIST3D</bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#dee1e1;">
<td valign="top" align="left" colspan="3"><bold>(a) Trained on SynapseMNIST3D</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet10</td>
<td valign="top" align="center">0.1126</td>
<td valign="top" align="center">0.2677</td>
</tr>
<tr>
<td valign="top" align="left">ResNet10&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.1126</bold></td>
<td valign="top" align="center"><bold>0.5387</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet18</td>
<td valign="top" align="center">0.1126</td>
<td valign="top" align="center">0.4097</td>
</tr>
<tr>
<td valign="top" align="left">ResNet18&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.2408</bold></td>
<td valign="top" align="center"><bold>0.5032</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">0.1126</td>
<td valign="top" align="center">0.2484</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.8822</bold></td>
<td valign="top" align="center"><bold>0.4548</bold></td>
</tr>
<tr style="background-color:#8f9496;color:#ffffff">
<td valign="top" align="left"><bold>Model</bold></td>
<td valign="top" align="center"><bold>SynapseMNIST3D</bold></td>
<td valign="top" align="center"><bold>VesselMNIST3D</bold></td>
</tr>
<tr style="background-color:#dee1e1;">
<td valign="top" align="left" colspan="3"><bold>(b) Trained on NoduleMNIST3D</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet10</td>
<td valign="top" align="center">0.2614</td>
<td valign="top" align="center">0.6885</td>
</tr>
<tr>
<td valign="top" align="left">ResNet10&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.7301</bold></td>
<td valign="top" align="center"><bold>0.8874</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet18</td>
<td valign="top" align="center">0.2784</td>
<td valign="top" align="center">0.8325</td>
</tr>
<tr>
<td valign="top" align="left">ResNet18&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.3182</bold></td>
<td valign="top" align="center"><bold>0.8874</bold></td>
</tr>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center"><bold>0.5455</bold></td>
<td valign="top" align="center">0.8874</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50&#x0002B;MedCSS</td>
<td valign="top" align="center">0.4119</td>
<td valign="top" align="center"><bold>0.8874</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values denote the experimental data with the best performance in each column.</p>
</table-wrap-foot>
</table-wrap>
<p>It can be observed that when models trained on SynapseMNIST3D are transferred to VesselMNIST3D and NoduleMNIST3D, networks incorporating our method demonstrate significantly enhanced robustness. ResNet10&#x0002B;MedCSS achieved an accuracy improvement from 0.2677 to 0.5387 on NoduleMNIST3D, while ResNet50&#x0002B;MedCSS saw accuracy rise from 0.1126 to 0.8822 on VesselMNIST3D. When the training set was switched to NoduleMNIST3D and transferred to SynapseMNIST3D, ResNet10&#x0002B;MedCSS achieved an accuracy of 0.7301, up from 0.2614.</p>
<p>These results demonstrate that our method captures shared semantic features at the structural level across different tasks, effectively mitigating the impact of domain distribution differences. By leveraging self-supervised alignment and coding rate constraints, the approach promotes structural consistency in features, enabling the network to maintain stable discriminative capabilities across different imaging modalities and tissue types.</p>
</sec>
<sec>
<label>3.5</label>
<title>Validation of adaptability for mainstream self-supervised methods</title>
<p>To further validate the universality and applicability of the MedCSS framework&#x02014;specifically whether its core mechanism can effectively empower mainstream self-supervised learning frameworks beyond traditional ResNet-based supervised models&#x02014;this section adopts the classic contrastive learning method SimCLR as the base framework. Two sets of comparative experiments were designed: pure SimCLR and SimCLR integrated with the MedCSS core mechanism. The test results are shown in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Performance comparison of SimCLR variants on three 3D datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center" colspan="4">SynapseMNIST3D</th>
<th valign="top" align="center" colspan="4">VesselMNIST3D</th>
<th valign="top" align="center" colspan="4">NoduleMNIST3D</th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">SimCLR</td>
<td valign="top" align="center"><bold>0.8419</bold></td>
<td valign="top" align="center">0.7500</td>
<td valign="top" align="center">0.6621</td>
<td valign="top" align="center"><bold>0.8831</bold></td>
<td valign="top" align="center">0.9058</td>
<td valign="top" align="center">0.7209</td>
<td valign="top" align="center">0.6327</td>
<td valign="top" align="center">0.9297</td>
<td valign="top" align="center">0.2756</td>
<td valign="top" align="center">0.0078</td>
<td valign="top" align="center">0.0154</td>
<td valign="top" align="center">0.5002</td>
</tr>
<tr>
<td valign="top" align="left">SimCLR&#x0002B;MedCSS</td>
<td valign="top" align="center">0.7955</td>
<td valign="top" align="center"><bold>0.9455</bold></td>
<td valign="top" align="center"><bold>0.8710</bold></td>
<td valign="top" align="center">0.8214</td>
<td valign="top" align="center"><bold>0.9476</bold></td>
<td valign="top" align="center"><bold>0.8140</bold></td>
<td valign="top" align="center"><bold>0.7778</bold></td>
<td valign="top" align="center"><bold>0.9530</bold></td>
<td valign="top" align="center"><bold>0.8484</bold></td>
<td valign="top" align="center"><bold>0.7500</bold></td>
<td valign="top" align="center"><bold>0.6713</bold></td>
<td valign="top" align="center"><bold>0.8897</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values denote the experimental data with the best performance in each column.</p>
</table-wrap-foot>
</table-wrap>
<p>The experimental results collectively validate the universal value of the MedCSS framework: integrating its core mechanism into SimCLR achieves a qualitative leap in the model&#x00027;s representation capabilities across three major medical image datasets. For localized, non-tubular structure datasets like NoduleMNIST3D, pure SimCLR nearly fails due to its difficulty in capturing weakly correlated structural features, with core metrics at extremely low levels. However, after integrating MedCSS, the model&#x00027;s recognition accuracy surged from 0.2756 to 0.8484, recall rose from a nearly negligible 0.0078 to 0.7500, with AUC exceeding 0.88, completely resolving the shortcomings of traditional contrastive learning in representing complex local structures. On the VesselMNIST3D and SynapseMNIST3D datasets, the F1 scores improved by 14.51 and 20.89 percentage points respectively, with AUC consistently above 0.82. These results demonstrate that MedCSS can serve as a universal enhancement module seamlessly integrated into classic self-supervised frameworks like SimCLR, leveraging causal constraints to overcome inherent limitations of traditional contrastive learning.</p>
</sec>
<sec>
<label>3.6</label>
<title>Performance benchmarking of cutting-edge self-supervised methods</title>
<p>To further validate the competitiveness of the MedCSS framework among contemporary state-of-the-art self-supervised methods, this section selects 3DINO (<xref ref-type="bibr" rid="B26">Xu et al., 2025</xref>) as the benchmark for comprehensive performance comparison against SimCLR&#x0002B;MedCSS and ResNet50&#x0002B;MedCSS. Results are presented in <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Performance comparison of 3DINO and MedCSS-enhanced models on three 3D datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center" colspan="4">SynapseMNIST3D</th>
<th valign="top" align="center" colspan="4">VesselMNIST3D</th>
<th valign="top" align="center" colspan="4">NoduleMNIST3D</th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">3DINO</td>
<td valign="top" align="center">0.7358</td>
<td valign="top" align="center"><bold>0.9883</bold></td>
<td valign="top" align="center">0.8453</td>
<td valign="top" align="center">0.7150</td>
<td valign="top" align="center">0.8115</td>
<td valign="top" align="center">0.4651</td>
<td valign="top" align="center">0.3571</td>
<td valign="top" align="center">0.7791</td>
<td valign="top" align="center">0.6774</td>
<td valign="top" align="center"><bold>0.7500</bold></td>
<td valign="top" align="center">0.4898</td>
<td valign="top" align="center">0.7931</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50&#x0002B;MedCSS</td>
<td valign="top" align="center">0.7528</td>
<td valign="top" align="center">0.9689</td>
<td valign="top" align="center">0.8498</td>
<td valign="top" align="center">0.7069</td>
<td valign="top" align="center">0.9319</td>
<td valign="top" align="center">0.6279</td>
<td valign="top" align="center">0.6750</td>
<td valign="top" align="center">0.9090</td>
<td valign="top" align="center"><bold>0.8677</bold></td>
<td valign="top" align="center">0.6719</td>
<td valign="top" align="center"><bold>0.6772</bold></td>
<td valign="top" align="center">0.8694</td>
</tr>
<tr>
<td valign="top" align="left">SimCLR&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.7955</bold></td>
<td valign="top" align="center">0.9455</td>
<td valign="top" align="center"><bold>0.8710</bold></td>
<td valign="top" align="center"><bold>0.8214</bold></td>
<td valign="top" align="center"><bold>0.9476</bold></td>
<td valign="top" align="center"><bold>0.8140</bold></td>
<td valign="top" align="center"><bold>0.7778</bold></td>
<td valign="top" align="center"><bold>0.9530</bold></td>
<td valign="top" align="center">0.8484</td>
<td valign="top" align="center"><bold>0.7500</bold></td>
<td valign="top" align="center">0.6713</td>
<td valign="top" align="center"><bold>0.8897</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values denote the experimental data with the best performance in each column.</p>
</table-wrap-foot>
</table-wrap>
<p>Compared to the cutting-edge 3DINO model, the MedCSS series demonstrates overwhelming performance advantages. Overall, despite leveraging cutting-edge self-supervised paradigms, 3DINO exhibits significant adaptability limitations in medical structure recognition. On the VesselMNIST3D dataset, its core metrics lag substantially behind. In contrast, the MedCSS series models, leveraging causal constraints and hierarchical alignment mechanisms, achieve an AUC over 17 percentage points higher than 3DINO on this dataset, with F1 scores surging by over 42 percentage points, achieving a qualitative leap in accuracy and stability for tubular structure recognition. On the NoduleMNIST3D dataset, 3DINO&#x00027;s accuracy and AUC remained at mid-to-low levels, while MedCSS models saw accuracy improvements exceeding 17 percentage points. Overall, through innovative core mechanism design, the MedCSS framework comprehensively outperforms state-of-the-art self-supervised models like 3DINO across three typical 3D medical image datasets, validating its core competitiveness in medical image representation learning.</p>
</sec>
<sec>
<label>3.7</label>
<title>Ablation experiment</title>
<p>To clarify the independent effects and synergistic value of the two core mechanisms&#x02014;&#x0201C;hierarchical feature alignment&#x0201D; and &#x0201C;causal coding rate regularization&#x0201D;&#x02014;within the MedCSS framework, this section designs three sets of controlled experiments: hierarchical feature alignment only (Naive-Align), coding rate regularization only (CR-Only), and dual-mechanism synergy (Full MedCSS), followed by fair validation against the ResNet baseline model across three major datasets. All experiments maintained consistency in backbone networks, training strategies, and evaluation metrics to precisely pinpoint the functional contributions of each component. Results are presented in <xref ref-type="table" rid="T5">Table 5</xref>.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Ablation experiment results of core components on three 3D datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center" colspan="4">SynapseMNIST3D</th>
<th valign="top" align="center" colspan="4">VesselMNIST3D</th>
<th valign="top" align="center" colspan="4">NoduleMNIST3D</th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
<th valign="top" align="center"><bold>ACC</bold></th>
<th valign="top" align="center"><bold>RECALL</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>ROC AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">0.7386</td>
<td valign="top" align="center">0.8794</td>
<td valign="top" align="center">0.8309</td>
<td valign="top" align="center">0.6186</td>
<td valign="top" align="center">0.7643</td>
<td valign="top" align="center">0.8581</td>
<td valign="top" align="center">0.6562</td>
<td valign="top" align="center">0.7483</td>
<td valign="top" align="center">0.8581</td>
<td valign="top" align="center">0.6562</td>
<td valign="top" align="center">0.6562</td>
<td valign="top" align="center">0.7834</td>
</tr>
<tr>
<td valign="top" align="left">Naive-Align</td>
<td valign="top" align="center">0.3949</td>
<td valign="top" align="center">0.2529</td>
<td valign="top" align="center">0.3790</td>
<td valign="top" align="center">0.6214</td>
<td valign="top" align="center">0.1126</td>
<td valign="top" align="center">1.0000</td>
<td valign="top" align="center">0.2024</td>
<td valign="top" align="center">0.7001</td>
<td valign="top" align="center">0.7677</td>
<td valign="top" align="center">0.0469</td>
<td valign="top" align="center">0.0769</td>
<td valign="top" align="center">0.5205</td>
</tr>
<tr>
<td valign="top" align="left">CR-Only</td>
<td valign="top" align="center">0.5540</td>
<td valign="top" align="center">0.4630</td>
<td valign="top" align="center">0.6025</td>
<td valign="top" align="center">0.6769</td>
<td valign="top" align="center">0.8377</td>
<td valign="top" align="center">0.8837</td>
<td valign="top" align="center">0.5507</td>
<td valign="top" align="center">0.9253</td>
<td valign="top" align="center">0.7935</td>
<td valign="top" align="center">0.0000</td>
<td valign="top" align="center">0.0000</td>
<td valign="top" align="center">0.5227</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50&#x0002B;MedCSS</td>
<td valign="top" align="center"><bold>0.7528</bold></td>
<td valign="top" align="center"><bold>0.9689</bold></td>
<td valign="top" align="center"><bold>0.8498</bold></td>
<td valign="top" align="center"><bold>0.7069</bold></td>
<td valign="top" align="center"><bold>0.9090</bold></td>
<td valign="top" align="center"><bold>0.8677</bold></td>
<td valign="top" align="center"><bold>0.7591</bold></td>
<td valign="top" align="center"><bold>0.9643</bold></td>
<td valign="top" align="center"><bold>0.8677</bold></td>
<td valign="top" align="center"><bold>0.6719</bold></td>
<td valign="top" align="center"><bold>0.6745</bold></td>
<td valign="top" align="center"><bold>0.9734</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values denote the experimental data with the best performance in each column.</p>
</table-wrap-foot>
</table-wrap>
<p>The results clearly demonstrate the necessity of dual-mechanism synergy, as reliance on either component alone fails to achieve stable medical image representation learning. When only hierarchical feature alignment is retained, accuracy on VesselMNIST3D drops to 0.1126, and recall on NoduleMNIST3D falls below 5%, rendering it inadequate for identifying complex medical structures. When only encoding rate regularization was retained, the model completely failed on the NoduleMNIST3D task (F1 score of 0), and SynapseMNIST3D performance was also significantly below baseline. In contrast, the complete MedCSS model with dual-mechanism synergy achieves comprehensive performance breakthroughs across all three datasets. This fully demonstrates that the two core mechanisms are complementary and indispensable, jointly constructing a structurally robust and semantically explicit 3D medical image representation system.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Analysis and discussion</title>
<sec>
<label>4.1</label>
<title>Feature alignment and morphological consistency</title>
<p>In the task of learning representations for complex 3D medical images, models often struggle to capture stable, generalizable structural features due to the absence of causal consistency constraints. This study proposes a learning framework integrating self-supervised feature alignment with causal coding rate constraints. It aims to enhance the model&#x00027;s ability to understand and separate critical medical structures through hierarchical distribution consistency and information compression mechanisms. The framework achieves significant performance improvements across multiple 3D datasets, validating the effective synergy between self-supervision and causal constraints. This provides a novel technical pathway for stable recognition of three-dimensional pathological structures.</p>
<p>From a mechanism perspective, the self-supervised feature alignment module achieves statistical consistency across hierarchical features by imposing Jensen&#x02013;Shannon divergence constraints between intermediate and high-level features. This consistency enables the network to maintain sensitivity to spatial geometric relationships during semantic abstraction. As shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, on the SynapseMNIST3D dataset, synapse regions exhibit approximate grayscale values and blurred boundaries, posing challenges for stable recognition by traditional models. After introducing feature alignment, the model preserves structural edge responses in intermediate layers while maintaining this morphological distribution in high-level semantic spaces. This enhancement elevates the ROC AUC from 0.6186 to 0.7069. Visualization results demonstrate that the model generates more continuous synaptic contours in enhanced feature maps, indicating that the self-supervised mechanism enhances morphological consistency across layers. This enables the network to maintain self-correction capabilities even in the absence of labeled data.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>SynapseMNIST3D dataset sample example.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1739716-g0004.tif">
<alt-text content-type="machine-generated">Black and white illustration showing a grid of uniformly spaced, irregular quadrilateral shapes arranged in rows and columns against a black background, creating a geometric and repetitive pattern.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>4.2</label>
<title>Causal coding rate constraint and information compression</title>
<p>Concurrently, the causal coding rate constraint constructs a cross-layer information compression mechanism through the determinant difference of the feature covariance matrix, effectively suppressing non-causal noise. Specifically, this constraint requires higher-level features to maintain a lower coding rate than intermediate layers, prompting the model to automatically filter redundant information during abstraction while preserving causally relevant structural signals. This mechanism demonstrates particularly strong performance on the VesselMNIST3D dataset. As shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, vascular samples exhibit complex branching and scale variations. Traditional networks often get trapped in local responses to high-frequency noise. However, after incorporating the coding rate constraint, the model progressively reduces channel variance associated with artifacts, focusing more on the main axis and connectivity features of blood vessels. Experimental results demonstrate that ResNet18&#x00027;s AUC improves from 0.7542 to 0.9643, with an F1 score increase of nearly 40%, indicating that this constraint endows the model with causal abstraction capabilities under information compression. Visualization results further confirm that the model exhibits significant focus on the main trunk regions of blood vessels in high-level features, while noise branches are markedly suppressed. The advantages of causal coding rate constraints are particularly evident when benchmarking against state-of-the-art self-supervised models: 3DINO achieves an AUC of only 0.7791 and an F1 score of 0.3571 on the VesselMNIST3D dataset. However, the SimCLR&#x0002B;MedCSS model incorporating MedCSS coding rate constraints elevates the AUC to 0.9530 and raises the F1 score by 42.07 percentage points.
This fully demonstrates that the combination of information compression and causal constraints significantly outperforms the traditional noise suppression paradigm of self-supervised learning.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>VesselMNIST3D dataset sample example.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1739716-g0005.tif">
<alt-text content-type="machine-generated">Grid of one hundred black cells each containing a unique irregular white abstract shape outlined by faint dashed lines, likely illustrating shape segmentation or mask samples for a computer vision or image analysis task.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>4.3</label>
<title>The mechanism of dual-component synergy is validated</title>
<p>Ablation experiments conducted on the ResNet50 backbone network further dissected the independent effects and synergistic value of the two core components within the MedCSS framework: &#x0201C;hierarchical feature alignment&#x0201D; and &#x0201C;causal coding rate regularization.&#x0201D; The experimental conclusions clearly indicate that these two core components mutually support each other and are indispensable. Relying solely on a single component cannot achieve stable feature learning for 3D medical images; only their synergistic collaboration can fully unleash the model&#x00027;s performance potential.</p>
<p>Regarding the individual effects of each component: When only hierarchical feature alignment is retained, the model maintains formal consistency in cross-layer feature distribution. However, lacking a mechanism to filter non-causal noise, it becomes prone to false correlations in structural features. This significantly degrades the model&#x00027;s ability to recognize complex medical structures, leading to critical structure omissions and feature distortions. When only encoding rate regularization is retained, while information compression effectively suppresses noise interference, it disrupts the semantic coherence of hierarchical features. This causes the high-level semantic space to lose its anchoring capability for low-level structural features, ultimately triggering semantic fragmentation and preventing the formation of a complete medical structure representation.</p>
<p>The integrated MedCSS model, however, achieves synergistic performance by combining both components. Hierarchical feature alignment provides stable structural anchors for cross-layer semantics, while causal coding rate regularization precisely filters redundant information. This synergy not only compensates for the inherent limitations of individual components but also ensures the model&#x00027;s precise capture of medical structural patterns while maintaining information purity during semantic abstraction. This is the core reason MedCSS consistently demonstrates performance advantages across diverse datasets and foundational frameworks.</p>
</sec>
<sec>
<label>4.4</label>
<title>Cross-domain generalization capability and anomalous results</title>
<p>More importantly, this method demonstrates robust generalization capabilities in cross-domain transfer experiments. When trained on SynapseMNIST3D and transferred to VesselMNIST3D, ResNet50&#x00027;s accuracy improved from 0.1126 to 0.8822; when trained on NoduleMNIST3D and transferred to SynapseMNIST3D, ResNet10&#x00027;s accuracy improved from 0.2614 to 0.7301. The results demonstrate that self-supervised alignment ensures continuity in the semantic space, while the causal encoding rate constraint mitigates domain-specific bias, enabling models to share stable representational foundations across different structural tasks. This cross-domain consistency indicates that models no longer rely on a single data distribution but instead learn intrinsic representations closer to physiological structure generation mechanisms, thereby exhibiting stronger generalizability in real clinical data transfer.</p>
<p>Despite encouraging overall results, localized anomalies emerged in select experiments. For instance, on the NoduleMNIST3D dataset, ResNet50 combined with our method achieved slightly lower AUC than the baseline. This anomaly primarily stems from excessive causal compression: under conditions of limited samples and high inter-class variance, the coding rate constraint prematurely discarded weakly correlated features, compromising marginal semantic information. This finding suggests that causal regularization requires dynamic adjustment mechanisms in small-sample scenarios. Such mechanisms should adapt compression intensity based on feature variance or task complexity, balancing information sparsity with semantic integrity.</p>
</sec>
<sec>
<label>4.5</label>
<title>Future direction</title>
<p>Future research will expand to more complex clinical datasets, exploring multimodal data fusion and collaborative mechanisms with large language models (LLMs), while adapting to small-sample scenarios through dynamic causal regularization. This approach holds significant medical value in cross-center data transfer and small-sample lesion detection, reducing diagnostic bias caused by dataset skewness. It provides robust imaging representation support for clinical practice, advancing the implementation of precision medicine.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>Overall, this study validates the unique advantages of causal self-supervision in three-dimensional medical imaging tasks. By integrating unsupervised feature alignment with information-theoretic encoding rate constraints, the model achieves significant improvements in feature stability, boundary sensitivity, and cross-domain generalization. However, the method remains subject to inherent limitations of deep learning frameworks, such as dependence on annotation accuracy and constraints on training dataset size. Future research will further integrate multi-center, multi-modal data and explore cross-layer intervention-enabled inference mechanisms based on graph causal models. This will advance three-dimensional causal self-supervision from statistical representation toward interpretable learning, providing more robust technical support for clinical lesion detection and structural visualization.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>JH: Visualization, Investigation, Writing &#x02013; original draft. FW: Writing &#x02013; original draft, Visualization, Formal analysis. XS: Methodology, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. FC: Supervision, Project administration, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<ack><title>Acknowledgments</title><p>The numerical calculations in this study were carried out on the ORISE Supercomputer (DFZX202416).</p></ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Anaya-Isaza</surname> <given-names>A.</given-names></name> <name><surname>Mera-Jim&#x000E9;nez</surname> <given-names>L.</given-names></name> <name><surname>Zequera-Diaz</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>An overview of deep learning in medical imaging</article-title>. <source>Inform. Med. Unlocked</source> <volume>26</volume>:<fpage>100723</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.imu.2021.100723</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Azizi</surname> <given-names>S.</given-names></name> <name><surname>Mustafa</surname> <given-names>B.</given-names></name> <name><surname>Ryan</surname> <given-names>F.</given-names></name> <name><surname>Beaver</surname> <given-names>Z.</given-names></name> <name><surname>Freyberg</surname> <given-names>J.</given-names></name> <name><surname>Deaton</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Big self-supervised models advance medical image classification,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source>, <fpage>3478</fpage>&#x02013;<lpage>3488</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00346</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Caron</surname> <given-names>M.</given-names></name> <name><surname>Misra</surname> <given-names>I.</given-names></name> <name><surname>Mairal</surname> <given-names>J.</given-names></name> <name><surname>Goyal</surname> <given-names>P.</given-names></name> <name><surname>Bojanowski</surname> <given-names>P.</given-names></name> <name><surname>Joulin</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Unsupervised learning of visual features by contrasting cluster assignments,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <fpage>9912</fpage>&#x02013;<lpage>9924</lpage>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Lu</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>Q.</given-names></name> <name><surname>Luo</surname> <given-names>X.</given-names></name> <name><surname>Adeli</surname> <given-names>E.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Transunet: transformers make strong encoders for medical image segmentation</article-title>. <source>arXiv preprint arXiv:2102.04306</source>.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Kornblith</surname> <given-names>S.</given-names></name> <name><surname>Norouzi</surname> <given-names>M.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;A simple framework for contrastive learning of visual representations,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>PmLR</publisher-loc>), <fpage>1597</fpage>&#x02013;<lpage>1607</lpage>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>&#x000C7;i&#x000E7;ek</surname> <given-names>&#x000D6;.</given-names></name> <name><surname>Abdulkadir</surname> <given-names>A.</given-names></name> <name><surname>Lienkamp</surname> <given-names>S. S.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name> <name><surname>Ronneberger</surname> <given-names>O.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;3D u-NET: learning dense volumetric segmentation from sparse annotation,&#x0201D;</article-title> in <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source> (<publisher-loc>Springer</publisher-loc>), <fpage>424</fpage>&#x02013;<lpage>432</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-46723-8_49</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Esteva</surname> <given-names>A.</given-names></name> <name><surname>Chou</surname> <given-names>K.</given-names></name> <name><surname>Yeung</surname> <given-names>S.</given-names></name> <name><surname>Naik</surname> <given-names>N.</given-names></name> <name><surname>Madani</surname> <given-names>A.</given-names></name> <name><surname>Mottaghi</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Deep learning-enabled medical computer vision</article-title>. <source>NPJ Dig. Med</source>. <volume>4</volume>:<fpage>5</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-020-00376-2</pub-id><pub-id pub-id-type="pmid">33420381</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Greenspan</surname> <given-names>H.</given-names></name> <name><surname>Van Ginneken</surname> <given-names>B.</given-names></name> <name><surname>Summers</surname> <given-names>R. M.</given-names></name></person-group> (<year>2016</year>). <article-title>Guest editorial deep learning in medical imaging: overview and future promise of an exciting new technique</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>35</volume>, <fpage>1153</fpage>&#x02013;<lpage>1159</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2016.2553401</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Grill</surname> <given-names>J.-B.</given-names></name> <name><surname>Strub</surname> <given-names>F.</given-names></name> <name><surname>Altch&#x000E9;</surname> <given-names>F.</given-names></name> <name><surname>Tallec</surname> <given-names>C.</given-names></name> <name><surname>Richemond</surname> <given-names>P.</given-names></name> <name><surname>Buchatskaya</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Bootstrap your own latent-a new approach to self-supervised learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <fpage>21271</fpage>&#x02013;<lpage>21284</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Deep residual learning for image recognition,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, <fpage>770</fpage>&#x02013;<lpage>778</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Chung</surname> <given-names>A. C.</given-names></name></person-group> (<year>2022</year>). <article-title>Disease prediction with edge-variational graph convolutional networks</article-title>. <source>Med. Image Anal</source>. <volume>77</volume>:<fpage>102375</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2022.102375</pub-id><pub-id pub-id-type="pmid">35144198</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ji</surname> <given-names>Z.</given-names></name> <name><surname>Ge</surname> <given-names>Y.</given-names></name> <name><surname>Chukwudi</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>S. M.</given-names></name> <name><surname>Peng</surname> <given-names>Y.</given-names></name> <name><surname>Zhu</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Counterfactual bidirectional co-attention transformer for integrative histology-genomic cancer risk stratification</article-title>. <source>IEEE J. Biomed. Health Inform</source>. <volume>29</volume>, <fpage>5862</fpage>&#x02013;<lpage>5874</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JBHI.2025.3548048</pub-id><pub-id pub-id-type="pmid">40042950</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jones</surname> <given-names>C.</given-names></name> <name><surname>Castro</surname> <given-names>D. C.</given-names></name> <name><surname>De Sousa Ribeiro</surname> <given-names>F.</given-names></name> <name><surname>Oktay</surname> <given-names>O.</given-names></name> <name><surname>McCradden</surname> <given-names>M.</given-names></name> <name><surname>Glocker</surname> <given-names>B.</given-names></name></person-group> (<year>2024</year>). <article-title>A causal perspective on dataset bias in machine learning for medical imaging</article-title>. <source>Nat. Mach. Intell</source>. <volume>6</volume>, <fpage>138</fpage>&#x02013;<lpage>146</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s42256-024-00797-8</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kamnitsas</surname> <given-names>K.</given-names></name> <name><surname>Ledig</surname> <given-names>C.</given-names></name> <name><surname>Newcombe</surname> <given-names>V. F.</given-names></name> <name><surname>Simpson</surname> <given-names>J. P.</given-names></name> <name><surname>Kane</surname> <given-names>A. D.</given-names></name> <name><surname>Menon</surname> <given-names>D. K.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Efficient multi-scale 3D CNN with fully connected crf for accurate brain lesion segmentation</article-title>. <source>Med. Image Anal</source>. <volume>36</volume>, <fpage>61</fpage>&#x02013;<lpage>78</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2016.10.004</pub-id><pub-id pub-id-type="pmid">27865153</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep learning</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>436</fpage>&#x02013;<lpage>444</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature14539</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Seo</surname> <given-names>M.</given-names></name> <name><surname>Park</surname> <given-names>S.</given-names></name> <name><surname>Imrus</surname> <given-names>S.</given-names></name> <name><surname>Ashok</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Enhancing free-hand 3D photoacoustic and ultrasound reconstruction using deep learning</article-title>. <source>IEEE Trans. Med. Imag</source>. <volume>44</volume>, <fpage>4652</fpage>&#x02013;<lpage>4665</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2025.3579454</pub-id><pub-id pub-id-type="pmid">40512645</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Litjens</surname> <given-names>G.</given-names></name> <name><surname>Kooi</surname> <given-names>T.</given-names></name> <name><surname>Bejnordi</surname> <given-names>B. E.</given-names></name> <name><surname>Setio</surname> <given-names>A. A. A.</given-names></name> <name><surname>Ciompi</surname> <given-names>F.</given-names></name> <name><surname>Ghafoorian</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>A survey on deep learning in medical image analysis</article-title>. <source>Med. Image Anal</source>. <volume>42</volume>, <fpage>60</fpage>&#x02013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2017.07.005</pub-id><pub-id pub-id-type="pmid">28778026</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Yang</surname> <given-names>S.</given-names></name> <name><surname>Xiang</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Towards general-purpose video reconstruction through synergy of grid-splicing diffusion and large language models</article-title>. <source>IEEE Trans. Circ. Syst. Video Technol</source>. <volume>2025</volume>:<fpage>3545795</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TCSVT.2025.3545795</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Liang</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Qian</surname> <given-names>Z.</given-names></name> <name><surname>Xing</surname> <given-names>P.</given-names></name> <name><surname>Chen</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Advancing hierarchical neural networks with scale-aware pyramidal feature learning for medical image dense prediction</article-title>. <source>Comput. Methods Progr. Biomed</source>. <volume>265</volume>:<fpage>108705</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cmpb.2025.108705</pub-id><pub-id pub-id-type="pmid">40184852</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Qiang</surname> <given-names>W.</given-names></name> <name><surname>Song</surname> <given-names>Z.</given-names></name> <name><surname>Gu</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Zheng</surname> <given-names>C.</given-names></name> <name><surname>Sun</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>On the generalization and causal explanation in self-supervised learning</article-title>. <source>Int. J. Comput. Vis</source>. <volume>133</volume>, <fpage>1727</fpage>&#x02013;<lpage>1754</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-024-02263-9</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;U-net: convolutional networks for biomedical image segmentation,&#x0201D;</article-title> in <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source> (<publisher-name>Springer</publisher-name>), <fpage>234</fpage>&#x02013;<lpage>241</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saha</surname> <given-names>S.</given-names></name> <name><surname>Sahoo</surname> <given-names>P. K.</given-names></name> <name><surname>Garain</surname> <given-names>U.</given-names></name></person-group> (<year>2025</year>). <article-title>Semi-CAVA: a causal variational approach to semi-supervised learning</article-title>. <source>IEEE Trans. Patt. Anal. Mach. Intell</source>. <volume>44</volume>, <fpage>10022</fpage>&#x02013;<lpage>10032</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2025.3594360</pub-id><pub-id pub-id-type="pmid">40742852</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep learning in neural networks: an overview</article-title>. <source>Neural Netw</source>. <volume>61</volume>, <fpage>85</fpage>&#x02013;<lpage>117</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2014.09.003</pub-id><pub-id pub-id-type="pmid">25462637</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sch&#x000F6;lkopf</surname> <given-names>B.</given-names></name> <name><surname>Locatello</surname> <given-names>F.</given-names></name> <name><surname>Bauer</surname> <given-names>S.</given-names></name> <name><surname>Ke</surname> <given-names>N. R.</given-names></name> <name><surname>Kalchbrenner</surname> <given-names>N.</given-names></name> <name><surname>Goyal</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Toward causal representation learning</article-title>. <source>Proc. IEEE</source> <volume>109</volume>, <fpage>612</fpage>&#x02013;<lpage>634</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JPROC.2021.3058954</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tajbakhsh</surname> <given-names>N.</given-names></name> <name><surname>Jeyaseelan</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Chiang</surname> <given-names>J. N.</given-names></name> <name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Ding</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>Embracing imperfect datasets: a review of deep learning solutions for medical image segmentation</article-title>. <source>Med. Image Anal</source>. <volume>63</volume>:<fpage>101693</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2020.101693</pub-id><pub-id pub-id-type="pmid">32289663</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>T.</given-names></name> <name><surname>Hosseini</surname> <given-names>S.</given-names></name> <name><surname>Anderson</surname> <given-names>C.</given-names></name> <name><surname>Rinaldi</surname> <given-names>A.</given-names></name> <name><surname>Krishnan</surname> <given-names>R. G.</given-names></name> <name><surname>Martel</surname> <given-names>A. L.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>A generalizable 3D framework and model for self-supervised learning in medical imaging</article-title>. <source>NPJ Dig. Med</source>. <volume>8</volume>:<fpage>639</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-025-02035-w</pub-id><pub-id pub-id-type="pmid">41203891</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Shi</surname> <given-names>R.</given-names></name> <name><surname>Wei</surname> <given-names>D.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Zhao</surname> <given-names>L.</given-names></name> <name><surname>Ke</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>MedMNIST v2-a large-scale lightweight benchmark for 2D and 3D biomedical image classification</article-title>. <source>Sci. Data</source> <volume>10</volume>:<fpage>41</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41597-022-01721-8</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/532102/overview">Sridhar Sunderam</ext-link>, University of Kentucky, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3009668/overview">Junhan Zhao</ext-link>, University of Chicago, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3313144/overview">Azad Singh</ext-link>, Indian Institute of Technology Jodhpur, India</p>
</fn>
</fn-group>
</back>
</article>