<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Surg.</journal-id><journal-title-group>
<journal-title>Frontiers in Surgery</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Surg.</abbrev-journal-title></journal-title-group>
<issn pub-type="epub">2296-875X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fsurg.2025.1655374</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Artificial intelligence prediction of nonenhancing brain tumor malignancy based on <italic>in vivo</italic> confocal laser endomicroscopic imaging</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Chen</surname><given-names>Jiuxu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/2896270/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role></contrib>
<contrib contrib-type="author"><name><surname>Xu</surname><given-names>Yuan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1606865/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role></contrib>
<contrib contrib-type="author"><name><surname>Abramov</surname><given-names>Irakliy</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1315592/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Calder&#x00F3;n-Valero</surname><given-names>Carlos E.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3226836/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role></contrib>
<contrib contrib-type="author"><name><surname>On</surname><given-names>Thomas J.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/2663255/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Eschbacher</surname><given-names>Jennifer M.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Li</surname><given-names>Baoxin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/183506/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Preul</surname><given-names>Mark C.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref><uri xlink:href="https://loop.frontiersin.org/people/62816/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role></contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Computing and Augmented Intelligence, Arizona State University</institution>, <city>Tempe</city>, <state>AZ</state>, <country country="US">United States</country></aff>
<aff id="aff2"><label>2</label><institution>The Loyal and Edith Davis Neurosurgical Research Laboratory, Barrow Neurological Institute, St. Joseph&#x2019;s Hospital and Medical Center</institution>, <city>Phoenix</city>, <state>AZ</state>, <country country="US">United States</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Neuropathology, Barrow Neurological Institute, St. Joseph&#x2019;s Hospital and Medical Center</institution>, <city>Phoenix</city>, <state>AZ</state>, <country country="US">United States</country></aff>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Mark C. Preul <email xlink:href="mailto:neuropub@barrowneuro.org">Neuropub@barrowneuro.org</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-05"><day>05</day><month>01</month><year>2026</year></pub-date>
<pub-date publication-format="electronic" date-type="collection"><year>2025</year></pub-date>
<volume>12</volume><elocation-id>1655374</elocation-id>
<history>
<date date-type="received"><day>27</day><month>06</month><year>2025</year></date>
<date date-type="rev-recd"><day>04</day><month>11</month><year>2025</year></date>
<date date-type="accepted"><day>25</day><month>11</month><year>2025</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 Chen, Xu, Abramov, Calder&#x00F3;n-Valero, On, Eschbacher, Li and Preul.</copyright-statement>
<copyright-year>2026</copyright-year><copyright-holder>Chen, Xu, Abramov, Calder&#x00F3;n-Valero, On, Eschbacher, Li and Preul</copyright-holder><license><ali:license_ref start_date="2026-01-05">https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p></license>
</permissions>
<abstract><sec><title>Background</title>
<p>Although nonenhancing tumors are often thought to be lower grade, malignant regions can be missed on conventional magnetic resonance imaging. Fluorescein-based confocal laser endomicroscopy (CLE) enables real-time, cellular-resolution imaging of brain tissue during tumor resection. It is particularly valuable for evaluating nonenhancing brain tumors. However, CLE interpretation remains subjective. Although CLE has high sensitivity, it is less specific than standard histology. Existing artificial intelligence (AI) models process CLE images as independent frames, neglecting the temporal context that human experts use during interpretation.</p>
</sec><sec><title>Methods</title>
<p>A novel sequence-based deep learning model was developed to classify tumor grade on the basis of CLE image sequences, mimicking the visual reasoning process of expert neuropathologists. CLE images were collected from 16 patients with nonenhancing brain tumors. Each sequence was labeled as high grade or low grade based on neuropathologist interpretation, blinded to final histopathology findings. Visual features were extracted using pretrained backbones (vision transformer, VGG16, ResNet50, Inception-ResNet-V2), followed by temporal modeling with a transformer encoder and temporal convolution. This model was compared with conventional frame-based classification across 3 random train-test splits.</p>
</sec><sec><title>Results</title>
<p>The dataset included 105 CLE sequences (3,173 images, 40 regions of interest). The sequence-based model achieved top-1 classification accuracies of 93&#x0025; (vision transformer), 88&#x0025; (VGG16), 74&#x0025; (ResNet50), and 67&#x0025; (Inception-ResNet-V2), outperforming corresponding frame-based models (78&#x0025;, 74&#x0025;, 55&#x0025;, and 50&#x0025;, respectively). Diagnostic performance was comparable to expert neuropathologist interpretation (87&#x0025;). The model demonstrated robustness in artifact-affected sequences and improved interpretability by incorporating temporal progression.</p>
</sec><sec><title>Conclusions</title>
<p>AI models that integrate both visual and temporal information from CLE digital imaging sequences can effectively classify brain tumor grade with accuracy comparable to that of expert neuropathologists, outperforming frame-based models. Such a system reduces interpretive subjectivity and holds promise as an intraoperative decision CLE support tool for nonenhancing brain tumor resection.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>deep learning</kwd>
<kwd>computer vision</kwd>
<kwd>confocal laser endomicroscopy</kwd>
<kwd>nonenhancing brain tumor</kwd>
<kwd>low-grade glioma</kwd>
<kwd>high-grade glioma</kwd>
</kwd-group><funding-group><award-group id="gs1"><funding-source id="sp1"><institution-wrap><institution>Barrow Neurological Foundation</institution><institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/100009797</institution-id></institution-wrap></funding-source></award-group><funding-statement>The author(s) declare financial support was received for the research and/or publication of this article. This study was supported by the Newsome Chair in Neurosurgery Research held by MP and by the Barrow Neurological Foundation.</funding-statement></funding-group><counts>
<fig-count count="5"/>
<table-count count="4"/><equation-count count="0"/><ref-count count="31"/><page-count count="11"/><word-count count="0"/></counts><custom-meta-group><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Neurosurgery</meta-value></custom-meta></custom-meta-group>
</article-meta>
</front>
<body><sec id="s2" sec-type="intro"><label>1</label><title>Introduction</title>
<p>The 2021 World Health Organization (WHO) classification of central nervous system (CNS) tumors has significantly reshaped the diagnosis and management of gliomas and other primary brain tumors, emphasizing molecular parameters alongside histopathological features (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>). This updated classification has stimulated advanced research in radiomics and multi-omics profiling to improve brain tumor subtyping and guide precision therapeutics (<xref ref-type="bibr" rid="B3">3</xref>, <xref ref-type="bibr" rid="B4">4</xref>). This shift has important implications for nonenhancing gliomas, which occur in adult and pediatric populations (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>). Thirty to forty percent of gliomas that do not enhance on preoperative magnetic resonance imaging (MRI) harbor anaplastic regions or are completely malignant according to previously published studies (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>). Based on their imaging appearance, these tumors are often mistakenly interpreted preoperatively as &#x201C;low-grade&#x201D; gliomas and thus are not properly categorized until intraoperative tissue assessment is performed. Proper identification and biopsy of these more aggressive regions during surgery is crucial to avoid histological undergrading and guide appropriate adjuvant treatment.</p>
<p>Confocal laser endomicroscopy (CLE) is a US Food and Drug Administration&#x2013;cleared, real-time intraoperative imaging modality that captures cellular resolution images of brain tissue using fluorescein sodium (FNa) as a contrast agent. During tumor resection, the surgeon-handheld CLE probe comes into contact with the tissue surface, generating approximately 1 image every 1.3&#x2005;s. After being given intravenously (in usual circumstances and according to current neurosurgical protocol, approximately 5&#x2013;10&#x2005;min before imaging), FNa leaks through the abnormal tumor blood brain barrier. FNa remains in the extracellular space, and its fluorescence lights up the background of the images. The cells are represented as dark round silhouettes of various sizes, morphology, and density. A few minutes of intraoperative imaging results in sequences that often comprise dozens or even hundreds of images within a single examination session. Neuropathologists and neurosurgeons interpret these sequences of CLE images in real time to differentiate between lesional and healthy tissue based on various characteristics revealed by the images, such as cellular density, cellular heterogeneity, and histoarchitectural background.</p>
<p>At tumor margins, CLE has demonstrated excellent sensitivity for detecting tumor infiltration. Additionally, this tool has shown promise in assisting with the determination of tumor (i.e., glioma) grade during resections of nonenhancing brain tumors. However, the interrater reliability of CLE interpretation by neuropathologists who are well-trained in CLE imaging is significantly lower than that associated with conventional hematoxylin and eosin (H&#x0026;E)&#x2013;stained pathology images, especially at the tumor margins, suggesting an inherent uncertainty in CLE images that contributes to subjectivity in their interpretation (<xref ref-type="bibr" rid="B9">9</xref>). Efforts have therefore been made to enhance the interpretation of CLE images. Furthermore, nonenhancing tumors present unique challenges in CLE imaging because they have relatively intact blood-brain barriers that are less permeable to gadolinium contrast and FNa diffusion, resulting in darker images with pathognomonic features or histoarchitectural content that is difficult to identify (<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>With the FNa-based CLE system, about 50&#x0025; of images show artifacts due to movement either by the surgeon or the brain itself, or by red blood cells, which can be confused with small tumor cells. The number of images collected can become overwhelming, especially when attempting to select relevant, informative, and actionable images for the surgeon. Artificial intelligence (AI) models have been proposed to detect diagnostic frames, transform grayscale images into H&#x0026;E-stained histology images, localize diagnostic features, and classify CLE images (<xref ref-type="bibr" rid="B11">11</xref>&#x2013;<xref ref-type="bibr" rid="B15">15</xref>). One common downside of these methods is that the datasets consist of single independent CLE images, thereby ignoring the sequential information that human interpreters often incorporate when the images are sequenced. This difference creates a fundamental gap between human interpretive behavior and the current AI model design.</p>
<p>This feasibility study aimed to bridge that gap by developing and applying an AI system that models the sequential visual reasoning process used by expert human interpreters when interpreting CLE image sequences. By capturing temporal context and visual progression across frames from sequentially acquired CLE images, our proposed system seeks to facilitate more accurate, consistent, and clinically relevant intraoperative assessments of tumor grade in brain tumors that do not significantly enhance on preoperative MRI. Such a system could enhance surgical decision-making when using CLE to evaluate these diagnostically complex tumors.</p>
</sec>
<sec id="s3" sec-type="methods"><label>2</label><title>Materials and methods</title>
<sec id="s3a"><label>2.1</label><title>CLE image acquisition</title>
<p>The CLE images used in this study were collected during clinical studies conducted at Barrow Neurological Institute and approved by the Institutional Review Board of Human Research at St. Joseph&#x0027;s Hospital and Medical Center (<xref ref-type="bibr" rid="B16">16</xref>, <xref ref-type="bibr" rid="B17">17</xref>). We used AI techniques to analyze CLE images acquired intraoperatively from patients who underwent brain tumor surgery at Barrow Neurological Institute, St. Joseph&#x0027;s Hospital and Medical Center, Phoenix, Arizona, between 2010 and 2023. All participants provided written informed consent prior to their participation.</p>
<p>Only CLE images from nonenhancing brain mass lesions identified on preoperative MRI, along with confirmed histopathology and assigned WHO CNS grade, were included in the analysis. Images were acquired using 1 of 2 CLE systems: a clinical system (CONVIVO, Carl Zeiss Meditec AG, Jena, Germany) or a preclinical system (Five1, Optiscan Imaging Ltd., Mulgrave, Australia). Both systems use similar technology developed by Optiscan Imaging Ltd. and generate comparable image quality, differing mainly in the user interface, image dimensions, and functional features. For all cases, FNa was administered intravenously at a dose of 5&#x2005;mg/kg within minutes before CLE image acquisition. CLE images with H&#x0026;E-stained histologic sections from the same imaging spot were collected for analysis.</p>
</sec>
<sec id="s3b"><label>2.2</label><title>CLE image interpretation by a neuropathologist</title>
<p>All CLE images were reviewed by a single neuropathologist (J.M.E.) with 15 years of experience and expertise in interpreting CLE images. Each CLE region of interest (ROI) was evaluated, and an overall high- or low-grade interpretation was assigned to the entire sequence. The neuropathologist remained blinded to the final pathological diagnosis during the evaluation. Atypical features (e.g., hypercellularity, cellular pleomorphism, necrosis, and microvascular proliferation) were documented and used as indicators of higher tumor grade and malignancy. In contrast, normal or mildly increased cellularity without evident cellular pleomorphism suggested lower-grade tumors. CLE interpretations were subsequently compared with the corresponding histopathological diagnosis and H&#x0026;E-stained sections, which served as the ground truth for the neuropathologist&#x0027;s interpretation and the AI model&#x0027;s classification.</p>
</sec>
<sec id="s3c"><label>2.3</label><title>Data preprocessing for AI-based CLE image analysis</title>
<p>After manually excluding CLE image sequences entirely affected by blood or motion artifacts, the remaining images underwent standardized preprocessing, which consisted of 2 main steps: (1) normalizing image intensity values to the [0,1] range to ensure numerical stability, maintain dataset consistency, and prevent gradient explosion during the training process; and (2) downsizing the image to meet the input requirements of the vision backbone architectures.</p>
</sec>
<sec id="s3d"><label>2.4</label><title>Neural network design for the sequence-based approach</title>
<p>Two approaches were employed for the AI-based interpretation of CLE images: a conventional single-frame approach (<xref ref-type="bibr" rid="B14">14</xref>, <xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B18">18</xref>) and a newly proposed sequence-based approach. A new neural network architecture composed of multiple extraction components was developed for the sequence-based approach (<xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>).</p>
<fig id="F1" position="float"><label>Figure&#x00A0;1</label>
<caption><p>The architecture of the newly proposed sequence-based classification model is designed to capture both spatial and temporal information from confocal laser endomicroscopy (CLE) image sequences. Following standardized preprocessing, each frame within a sequence is passed through 1 of 4 visual backbone models (Inception-ResNet-V2, ResNet50, VGG16, or vision transformer) to extract high-dimensional spatial feature embeddings. These visual embeddings are then input into a transformer encoder, which models temporal dependencies across the sequence, followed by a temporal convolutional layer that further aggregates sequential information and reduces dimensionality. The resulting visual-temporal representation is fed into a fully connected layer with a Softmax activation to produce a binary prediction label, which is compared against the ground truth for tumor grade classification. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fsurg-12-1655374-g001.tif"><alt-text content-type="machine-generated">Flowchart illustrating a process where a CLE image sequence undergoes data preprocessing. It enters visual backbone models like Inception-ResNet-V2, ResNet50, VGG16, and Vision Transformer to create visual embeddings. These embeddings are processed by a Transformer Encoder with layers such as multihead self-attention, feed-forward network, and a fully connected layer. Visual-temporal features undergo temporal convolution leading to binary softmax classification. The output includes exam predictions (high grade) and ground truth (low grade), using binary cross-entropy loss function and model training on classification error.</alt-text>
</graphic>
</fig>
<sec id="s3d1"><label>2.4.1</label><title>Visual feature extraction</title>
<p>First, to extract spatial features from each CLE image, we employed neural network models that were pretrained on the ImageNet V2 dataset as visual backbone architectures (<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B20">20</xref>). Several well-established architecture models [Inception-ResNet-V2 (<xref ref-type="bibr" rid="B21">21</xref>), ResNet50 (<xref ref-type="bibr" rid="B22">22</xref>), VGG16 (<xref ref-type="bibr" rid="B23">23</xref>), and vision transformer (<xref ref-type="bibr" rid="B24">24</xref>)] have demonstrated strong performance in medical imaging and proven effectiveness in image representation learning. To comprehensively evaluate their performance on our dataset, all 4 architecture models were integrated into our model in parallel. Each visual backbone architecture processes CLE images and encodes them into high-dimensional feature vectors. For a given sequence of CLE images, this results in a corresponding sequence of feature vectors that serves as input to the subsequent temporal modeling module.</p>
</sec>
<sec id="s3d2"><label>2.4.2</label><title>Temporal modeling</title>
<p>The extracted visual features are passed into a transformer encoder to model the temporal relationships and dependencies across the CLE image sequence (<xref ref-type="bibr" rid="B25">25</xref>). To maintain the temporal order of CLE frames, positional encoding is added to the input feature sequence, allowing the model to learn the relative and absolute positions of frames within the examination. This positional information is essential for capturing disease progression patterns in sequential CLE images.</p>
</sec>
<sec id="s3d3"><label>2.4.3</label><title>Temporal convolution and dimensional reduction</title>
<p>To further aggregate temporal information and reduce the sequence dimension, the output of the transformer encoder is passed through a 1-dimensional temporal convolutional layer (<xref ref-type="bibr" rid="B26">26</xref>). This layer applies filters across the temporal axis to capture local temporal patterns and combine information across neighboring frames. It also compresses the sequence into a single global temporal embedding vector that represents the entire CLE image sequence. Finally, the resulting latent visual-temporal feature is processed by a fully connected layer, followed by a Softmax classifier that outputs a binary label indicating tumor grade: high-grade tumor or low-grade tumor.</p>
</sec>
</sec>
<sec id="s3e"><label>2.5</label><title>Model training and testing</title>
<p>We randomly divided the dataset of 105 CLE image sequences into a training set (78 sequences, 74.3&#x0025;) and a testing set (27 sequences, 25.7&#x0025;). The neuropathologist&#x0027;s interpretation of the matching H&#x0026;E section as high- or low-grade tumor was used as the ground truth label. To ensure robustness and minimize bias from randomization, we conducted experiments on 3 different random splits. In the training stage, we used the cross-entropy loss function to compute classification loss by comparing the predicted high and low tumor grades with the ground truth label. Model parameters were optimized using the adaptive moment estimator optimizer, which facilitated efficient backpropagation and convergence. During testing, the model processed CLE image sequences following the same procedure as in the training stage. Performance was evaluated using top-1 accuracy, calculated on a per-examination basis and reported for each test set in addition to sensitivity, specificity, positive predictive value (PPV) and negative predictive value (NPV). All experiments were conducted using a desktop NVIDIA GeForce RTX 3090 graphics processing unit (GPU) (24 GB memory).</p>
</sec>
</sec>
<sec id="s4" sec-type="results"><label>3</label><title>Results</title>
<sec id="s4a"><label>3.1</label><title>Descriptive analysis</title>
<p>In total, 105 CLE image sequences containing 3,173 CLE images from 40 ROIs across 16 cases were included in the analysis (<xref ref-type="table" rid="T1">Table&#x00A0;1</xref>). These comprised 2 astrocytomas (12&#x0025;), 10 oligodendrogliomas (63&#x0025;), 1 glioblastoma (6&#x0025;), 2 glioneuronal and neuronal tumors (12&#x0025;), and 1 miscellaneous mass lesion (6&#x0025;). Eight (50&#x0025;) were pathology-confirmed low-grade (WHO CNS grade 1 and 2) lesions, and 8 (50&#x0025;) were pathology-confirmed high-grade (WHO CNS grade 3 and 4) lesions.</p>
<table-wrap id="T1" position="float"><label>Table&#x00A0;1</label>
<caption><p>Tumor histology type and WHO grade of 16 nonenhancing brain tumors.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Histology type</th>
<th valign="top" align="center">WHO grade</th>
<th valign="top" align="center">No. of tumors</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Astrocytoma</td>
<td valign="top" align="center">3</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">Oligodendroglioma</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">5</td>
</tr>
<tr>
<td valign="top" align="center">3</td>
<td valign="top" align="center">5</td>
</tr>
<tr>
<td valign="top" align="left">Glioblastoma</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">Glioneuronal and neuronal tumors</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left">Miscellaneous</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF1"><p>WHO, World Health Organization.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4b"><label>3.2</label><title>Neuropathologist interpretation</title>
<p>The neuropathologist successfully interpreted CLE images from 38 of 40 ROIs. Concurrent hypercellularity and pleomorphism were observed in 23 ROIs, with 18 classified as high-grade based on histology; 15 ROIs did not exhibit hypercellularity or pleomorphism, all corresponding to low-grade pathology (<xref ref-type="fig" rid="F2">Figure&#x00A0;2A</xref>). The overall accuracy of the neuropathologist&#x0027;s interpretation was 87&#x0025;.</p>
<fig id="F2" position="float"><label>Figure&#x00A0;2</label>
<caption><p>Confusion matrices comparing the interpretation by the human expert <bold>(A)</bold> and the artificial intelligence (AI) models&#x2019; classification <bold>(B&#x2013;E)</bold> in differentiating high-grade and low-grade pathology. Each panel shows a confusion matrix in which the <italic>x</italic>-axis represents the ground truth (histologically confirmed tumor grade), and the <italic>y</italic>-axis represents the interpretation by the human expert or the predicted label by the AI model. Color intensity corresponds to the count in each cell, with darker colors indicating higher values. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fsurg-12-1655374-g002.tif"><alt-text content-type="machine-generated">Five confusion matrices labeled A to E compare different methods for grading. Panel A shows Expert Interpretation versus Histology. Panel B shows Inception-ResNet-V2 versus Ground Truth. Panel C depicts ResNet50 versus Ground Truth. Panel D illustrates VGG16 versus Ground Truth. Panel E compares Vision Transformer versus Ground Truth. Each matrix has Low Grade and High Grade as both axes, with numbers indicating quantity at each intersection, represented by color intensity.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4c"><label>3.3</label><title>AI model classification</title>
<p>To evaluate the effectiveness and clinical relevance of our proposed sequence-based CLE image classification framework, we conducted a thorough assessment focusing on 2 main aspects: (1) the influence of the 4 visual backbone models, and (2) the classification accuracy of our sequence-based method compared to traditional frame-based image classification.</p>
<sec id="s4c1"><label>3.3.1</label><title>Visual model comparison</title>
<p>Among the 4 visual backbone models, vision transformer exhibited the strongest performance in capturing complex spatial structures and achieved the highest overall classification accuracy, followed by VGG16, ResNet50, and Inception-ResNet-V2, respectively (<xref ref-type="table" rid="T2">Tables&#x00A0;2</xref>, <xref ref-type="table" rid="T3">3</xref>). This indicated that transformer-based visual models, with their capacity to capture long-range spatial dependencies among image patches, might be particularly effective in analyzing CLE images in which subtle structural changes signal tumor grade. Confusion matrices illustrate the distinct tendency of the Inception-ResNet-V2 model to misclassify low-grade pathology as high-grade pathology (<xref ref-type="fig" rid="F2">Figure&#x00A0;2B</xref>), whereas the classifications from the other visual models were more balanced (<xref ref-type="fig" rid="F2">Figures&#x00A0;2C&#x2013;E</xref>). Class activation maps (<xref ref-type="fig" rid="F3">Figures&#x00A0;3</xref>&#x2013;<xref ref-type="fig" rid="F5">5</xref>) highlighted the areas of the input image that contributed most to specific class predictions.</p>
<fig id="F3" position="float"><label>Figure&#x00A0;3</label>
<caption><p>Class activation maps (CAMs) from 4 visual backbone models on a World Health Organization (WHO) grade 1 multinodular and vacuolating neuronal tumor sample. Representative confocal laser endomicroscopy (CLE) image in its original form <bold>(A)</bold> and after preprocessing <bold>(B)</bold>. CAMs generated by Inception-ResNet-V2 <bold>(C)</bold>, ResNet50 <bold>(D)</bold>, VGG16 <bold>(E)</bold>, and vision transformer <bold>(F)</bold> overlaid on the input image. Inception-ResNet-V2 misclassified the sample as high-grade, and ResNet50, VGG16, and vision transformer correctly classified it as low-grade. The CLE image sequence is shown in <xref ref-type="sec" rid="s13">Supplementary Video S1</xref>. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fsurg-12-1655374-g003.tif"><alt-text content-type="machine-generated">Multinodular and vacuolating neuronal tumor images, WHO grade 1. Panels A and B show grayscale tumor images. Panels C-F illustrate color-coded activation maps from neural networks: Inception-ResNet-V2 (C) indicates high grade, ResNet50 (D), VGG16 (E), and Vision Transformer (F) indicate low grade, with a color scale from low (blue) to high (red) activation.</alt-text>
</graphic>
</fig>
<fig id="F4" position="float"><label>Figure&#x00A0;4</label>
<caption><p>Class activation maps (CAMs) from 4 visual backbone models on a World Health Organization (WHO) grade 3 oligodendroglioma sample. Representative confocal laser endomicroscopy (CLE) image in its original form <bold>(A)</bold> and after preprocessing <bold>(B)</bold>. CAMs generated by Inception-ResNet-V2 <bold>(C)</bold>, ResNet50 <bold>(D)</bold>, VGG16 <bold>(E)</bold>, and vision transformer <bold>(F)</bold>, overlaid on the input image. ResNet50 and VGG16 misclassified the sample as low-grade, and Inception-ResNet-V2 and vision transformer correctly classified the sample as high-grade. The CLE image sequence is shown in <xref ref-type="sec" rid="s13">Supplementary Video S2</xref>. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fsurg-12-1655374-g004.tif"><alt-text content-type="machine-generated">Images A and B show grayscale images representing oligodendroglioma tissue samples, IDH mutant, WHO grade 3. Below, images C to F display heatmaps of the same tissue analyzed by different models: Inception-ResNet-V2, ResNet50, VGG16, and Vision Transformer. Color gradients indicate activation levels, with Inception-ResNet-V2 and Vision Transformer showing high-grade activations, while ResNet50 and VGG16 indicate low-grade activations.</alt-text>
</graphic>
</fig>
<fig id="F5" position="float"><label>Figure&#x00A0;5</label>
<caption><p>Class activation maps (CAMs) from 4 visual backbone models on a World Health Organization (WHO) grade 3 astrocytoma sample. Representative confocal laser endomicroscopy (CLE) image in its original form <bold>(A)</bold> and after preprocessing <bold>(B)</bold>. <bold>(C&#x2013;F)</bold> CAMs generated by Inception-ResNet-V2 <bold>(C)</bold>, ResNet50 <bold>(D)</bold>, VGG16 <bold>(E)</bold>, and vision transformer <bold>(F)</bold> overlaid on the input image. Red to blue colormap indicates the degree of model activation, with warmer colors denoting stronger relevance to the predicted high-grade label. All 4 models correctly classified the sample as high-grade. The CLE image sequence is shown in <xref ref-type="sec" rid="s13">Supplementary Video S3</xref>. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fsurg-12-1655374-g005.tif"><alt-text content-type="machine-generated">CLE images of astrocytoma, IDH mutant, WHO grade 3 are shown in grayscale panels A and B. Panels C, D, E, and F display color maps from Inception-ResNet-V2, ResNet50, VGG16, and Vision Transformer, respectively, indicating high-grade activations with a color scale from low (blue) to high (red).</alt-text>
</graphic>
</fig>
<table-wrap id="T2" position="float"><label>Table&#x00A0;2</label>
<caption><p>Top-1 accuracy of the frame-based classification model.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left" rowspan="2">Split</th>
<th valign="top" align="center" colspan="4">Accuracy, &#x0025;</th>
</tr>
<tr>
<th valign="top" align="center">Inception-ResNet-V2</th>
<th valign="top" align="center">ResNet50</th>
<th valign="top" align="center">VGG16</th>
<th valign="top" align="center">Vision transformer</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="center">49</td>
<td valign="top" align="center">53</td>
<td valign="top" align="center">77</td>
<td valign="top" align="center">82</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="center">53</td>
<td valign="top" align="center">53</td>
<td valign="top" align="center">70</td>
<td valign="top" align="center">75</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="center">49</td>
<td valign="top" align="center">60</td>
<td valign="top" align="center">74</td>
<td valign="top" align="center">76</td>
</tr>
<tr>
<td valign="top" align="left">Overall</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">55</td>
<td valign="top" align="center">74</td>
<td valign="top" align="center">78</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float"><label>Table&#x00A0;3</label>
<caption><p>Top-1 accuracy of the sequence-based classification model.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left" rowspan="2">Split</th>
<th valign="top" align="center" colspan="4">Accuracy, &#x0025;</th>
</tr>
<tr>
<th valign="top" align="center">Inception-ResNet-V2</th>
<th valign="top" align="center">ResNet50</th>
<th valign="top" align="center">VGG16</th>
<th valign="top" align="center">Vision transformer</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="center">68</td>
<td valign="top" align="center">71</td>
<td valign="top" align="center">93</td>
<td valign="top" align="center">96</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="center">64</td>
<td valign="top" align="center">68</td>
<td valign="top" align="center">82</td>
<td valign="top" align="center">86</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="center">68</td>
<td valign="top" align="center">78</td>
<td valign="top" align="center">89</td>
<td valign="top" align="center">89</td>
</tr>
<tr>
<td valign="top" align="left">Overall</td>
<td valign="top" align="center">67</td>
<td valign="top" align="center">74</td>
<td valign="top" align="center">88</td>
<td valign="top" align="center">93</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4c2"><label>3.3.2</label><title>Frame-based and sequence-based classification models</title>
<p>The classic frame-based classification model yielded overall top-1 accuracies of 78&#x0025;, 74&#x0025;, 55&#x0025;, and 50&#x0025; with vision transformer, VGG16, ResNet50, and Inception-ResNet-V2, respectively, as the visual backbone (<xref ref-type="table" rid="T2">Table&#x00A0;2</xref>). In comparison, the sequence-based model produced tumor grade classification with significantly higher top-1 accuracies (vision transformer, 90&#x0025;; VGG16, 88&#x0025;; ResNet50, 73&#x0025;; Inception-ResNet-V2, 67&#x0025;; <xref ref-type="table" rid="T3">Table&#x00A0;3</xref>). Furthermore, with vision transformer and VGG16, the performance of the sequence-based model was comparable to that of the human neuropathologist&#x0027;s interpretation (87&#x0025;) with excellent sensitivity, specificity, PPV, and NPV (<xref ref-type="table" rid="T4">Table&#x00A0;4</xref>).</p>
<table-wrap id="T4" position="float"><label>Table&#x00A0;4</label>
<caption><p>Sensitivity, specificity, positive predictive value, and negative predictive value of the sequence-based model using vision transformer and VGG16 as the visual backbone.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left"/>
<th valign="top" align="center">Vision transformer, &#x0025;</th>
<th valign="top" align="center">VGG16, &#x0025;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Sensitivity</td>
<td valign="top" align="center">92</td>
<td valign="top" align="center">92</td>
</tr>
<tr>
<td valign="top" align="left">Specificity</td>
<td valign="top" align="center">88</td>
<td valign="top" align="center">82</td>
</tr>
<tr>
<td valign="top" align="left">PPV</td>
<td valign="top" align="center">92</td>
<td valign="top" align="center">88</td>
</tr>
<tr>
<td valign="top" align="left">NPV</td>
<td valign="top" align="center">88</td>
<td valign="top" align="center">87</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF2"><p>PPV, positive predictive value; NPV, negative predictive value.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="discussion"><label>4</label><title>Discussion</title>
<p>Since the introduction of CLE imaging technology, efforts have been made to apply machine learning and deep learning methods to assist in interpreting CLE images (<xref ref-type="bibr" rid="B11">11</xref>&#x2013;<xref ref-type="bibr" rid="B15">15</xref>). However, earlier studies focused on the analysis of individual images. To our knowledge, this is the first study to process sequentially acquired images to extract temporal information. Our AI model performed well in predicting the tumor grade of nonenhancing brain tumors based on CLE imaging. It demonstrated promise in intraoperative diagnosis and guidance of surgical maneuvers related to the tumor, including identifying tissue regions for further imaging and biopsy.</p>
<sec id="s5a"><label>4.1</label><title>Models for image analysis and classification</title>
<p>The 4 visual backbone models process images in different ways, as evidenced by the varying high-activation regions (<xref ref-type="fig" rid="F3">Figures&#x00A0;3</xref>&#x2013;<xref ref-type="fig" rid="F5">5</xref>). For Inception-ResNet-V2, ResNet50, and VGG16, the neural networks appeared to be activated to different degrees by the areas of high cellular density (<xref ref-type="fig" rid="F3">Figures&#x00A0;3C&#x2013;E</xref>, <xref ref-type="fig" rid="F4">4C&#x2013;E</xref>, <xref ref-type="fig" rid="F5">5C&#x2013;E</xref>), whereas the specific activation patterns did not always align with the regions that human experts focus on. Therefore, such AI model applications offer valuable dimensions to image analysis that can identify, concentrate on, and interpret features that might be overlooked by human interpretation or involve subtleties with significant implications.</p>
<p>VGG16 is a classic deep learning model composed of many simple layers that use small convolutional filters to extract detailed visual features progressively. Its straightforward and uniform architecture allows the model to focus on local textures and visually distinct patterns, which is advantageous for analyzing CLE images in which dense cellular regions are often key indicators. In our study, VGG16 achieved an accuracy of 88&#x0025; in the sequence-based classification task, closely following vision transformer. Although its reliance on localized features can be a limitation in more subtle or heterogeneous cases, the activation maps reveal strong and focused attention on hypercellular areas, which aligns with the model&#x0027;s tendency to respond to clear localized signals.</p>
<p>ResNet50 introduces shortcut or &#x201C;residual&#x201D; connections that allow the model to preserve information across layers, enabling it to learn more abstract patterns. Despite this innovative design, ResNet50 achieved a relatively low accuracy (73&#x0025;) in our study. Although it excels at detecting obvious features, it is less effective at integrating global context, which is crucial in interpreting CLE images with diffuse or ambiguous findings. In activation maps, ResNet50&#x0027;s narrower focus was associated with misclassification errors, suggesting that it missed more subtle contextual cues, corresponding to prominent histoarchitectural features, in the overall frame (<xref ref-type="fig" rid="F3">Figures&#x00A0;3D</xref>, <xref ref-type="fig" rid="F4">4D</xref>, <xref ref-type="fig" rid="F5">5D</xref>).</p>
<p>Inception-ResNet-V2 combines 2 advanced design concepts. The Inception module allows the model to analyze features at multiple scales simultaneously, and the residual connections help preserve information as it passes through many layers. This architecture can be overly sensitive to complex textures and subtle variations. The activation maps demonstrated excessive attention to what appears to resemble hypercellularity, whereas other parts of the image that showed low-grade features were ignored, indicating that the model tended to interpret benign structural complexity as malignancy (<xref ref-type="fig" rid="F3">Figures&#x00A0;3C</xref>, <xref ref-type="fig" rid="F4">4C</xref>, <xref ref-type="fig" rid="F5">5C</xref>). This led to the obvious tendency to overclassify low-grade pathology as high-grade pathology, indicated by the confusion matrix (<xref ref-type="fig" rid="F2">Figure&#x00A0;2B</xref>).</p>
<p>Vision transformer divides an image into small, fixed-size patches and converts each patch into a token composed of numbers. It then processes these tokens using self-attention mechanisms to learn how they are related to each other and build a global understanding of the image, without relying on traditional convolutional operations. Vision transformer produces more dispersed activation patterns due to its patch-based processing and lack of spatial inductive bias, making the high-activation areas less intuitive to human visual inspection and less evident in terms of histoarchitectural characteristics in the image (<xref ref-type="fig" rid="F3">Figures&#x00A0;3F</xref>, <xref ref-type="fig" rid="F4">4F</xref>, <xref ref-type="fig" rid="F5">5F</xref>).</p>
<p>In a multinodular and vacuolating neuronal tumor, regions of hypercellularity were observed in the image, but there was no significant cellular atypia or heterogeneity. ResNet50, VGG16, and vision transformer accurately classified this sample as a low-grade tumor (<xref ref-type="fig" rid="F3">Figures&#x00A0;3D&#x2013;F</xref>, <xref ref-type="sec" rid="s13">Supplementary Video S1</xref>), whereas Inception-ResNet-V2 appeared to focus on the hypercellularity and mistakenly classified the tumor as a high-grade pathology (<xref ref-type="fig" rid="F3">Figure&#x00A0;3C</xref>). In another WHO grade 3 oligodendroglioma sample, both ResNet50 and VGG16 were activated by the hypercellular regions and incorrectly classified the tumor as low-grade (<xref ref-type="fig" rid="F4">Figures&#x00A0;4D,E</xref>, <xref ref-type="sec" rid="s13">Supplementary Video S2</xref>), whereas the activation of Inception-ResNet-V2 and vision transformer resulted in the correct classification (<xref ref-type="fig" rid="F4">Figures&#x00A0;4C,F</xref>). All 4 visual models correctly classified the sample from a WHO grade 3 astrocytoma as high-grade, although the activation pattern varied significantly (<xref ref-type="fig" rid="F5">Figure&#x00A0;5</xref>, <xref ref-type="sec" rid="s13">Supplementary Video S3</xref>).</p>
</sec>
<sec id="s5b"><label>4.2</label><title>Extraction, classification, and reasoning of CLE image features</title>
<p>For our experiments, before preprocessing, CLE image sequences that were completely affected by red blood cell or motion artifacts were manually removed. We did not use a convolutional neural network-based diagnostic frame detection model (<xref ref-type="bibr" rid="B27">27</xref>) to filter single artifactual images to avoid disrupting the entirety of the image sequences.</p>
<p>To extract temporal features from the CLE image sequences, we selected the transformer encoder. It is equipped with 2 multihead self-attention layers that enable it to capture long-range temporal correlations and the sequential progression of CLE image content. Recurrent neural network&#x2013;based models, such as long short-term memory (LSTM) networks (<xref ref-type="bibr" rid="B28">28</xref>), are theoretically suitable for this task. However, these models operate in an autoregressive manner, predicting each output step based on the previous one, which can lead to recurrent error propagation over long sequences. In contrast, the transformer encoder allows for at-once inference across the entire sequence, mitigating this issue through the use of self-attention encoding in parallel.</p>
<p>We compared our sequence-based classification model against a conventional frame-based classification model as the baseline. Although simple, the frame-based model overlooks temporal progression and contextual continuity, often resulting in erratic or inconsistent predictions due to frame-level noise or artifacts. In contrast, our sequence-based classification model mimics how a human expert evaluates the entire CLE examination, leveraging temporal modeling with a transformer encoder and local pattern extraction embedded in temporal convolutions to create a robust, temporally coherent representation of the full examination. This led to more stable and clinically aligned predictions, especially in borderline cases or instances with consecutive images affected by blood or motion artifacts.</p>
<p>Importantly, our method mimics how human experts interpret CLE imaging in a real-world setting. Human experts not only assess single frames in isolation but also gather information from sequential frames. In the first clinical feasibility study with the CLE system, it was apparent that our CLE-trained neuropathologist preferred examining whole sequences to uncover histoarchitectural features that individual static images do not reveal (<xref ref-type="bibr" rid="B29">29</xref>). This distinction is crucial when differentiating between blood contamination and densely packed tumor cells. Small, uniformly sized, disc-shaped cells that flow across the field of view in consecutive frames are more likely to be red blood cells entering the CLE imaging site and can thus be differentiated from relatively stable, motionless tumor cells. Additionally, the human brain&#x0027;s ability to parse information from unaffected areas when examining a series of images partially affected by motion artifacts allows for better utilization of these otherwise unusable images.</p>
<p>By explicitly modeling CLE as a temporal sequence and predicting at the examination level, our system emulates these human reasoning processes. This alignment between model behavior and clinical workflow enhances both the interpretability and trustworthiness of AI-assisted diagnostics, making it a better candidate for real-world intraoperative support. In turn, all visual backbone models benefited from being embedded in our sequence-aware architecture, showing clear improvements over their standalone, frame-based counterparts.</p>
</sec>
<sec id="s5c"><label>4.3</label><title>Integration of AI-based CLE image analysis into current intraoperative workflow</title>
<p>There are several features that greatly enhance the integration of CLE imaging into current intraoperative neurosurgical workflow. Martirosyan et al. previously reported successful co-registration of the CLE probe with the neuronavigation system to localize the precise CLE imaging site (<xref ref-type="bibr" rid="B17">17</xref>). Recently, Muscas et al. demonstrated the feasibility of connecting the CLE system (Zeiss CONVIVO), the neuronavigation system (Medtronic StealthStation S8), and the surgical microscope (Zeiss Kinevo 900) to enable picture-in-picture display of CLE images in the surgical microscope visual field. This configuration reduced CLE imaging time and increased the proportion of usable CLE images by allowing the surgeon to assess image quality without diverting their gaze from the operative field (<xref ref-type="bibr" rid="B30">30</xref>). In addition, a built-in telepathology software platform allows for real-time transfer of CLE images from the operating room to the neuropathologist&#x0027;s device, along with voice communication between the neurosurgeon and neuropathologist (<xref ref-type="bibr" rid="B31">31</xref>).</p>
<p>Coupling our sequence-based AI model with an integrated surgical visualization ecosystem could provide neurosurgeons and neuropathologists with additional diagnostic information. This model can potentially be generalized to other CLE and intraoperative imaging systems that generate sequential images. Importantly, our approach is not limited by computing power. We used a consumer-grade GPU for model training and testing. For the 4 visual backbones, average inference times per sequence were as follows: Inception-ResNet-V2, 0.17&#x2005;s; ResNet50, 0.11&#x2005;s; VGG16, 0.19&#x2005;s; and vision transformer, 0.33&#x2005;s. With cloud-based deployment, our model can be pretrained and executed remotely on professional AI-optimized GPUs to provide near real-time feedback after image acquisition, enabling seamless integration with intraoperative CLE workflow.</p>
<p>Combined with <italic>in vivo</italic> CLE imaging, this tool would allow immediate intraoperative sampling of multiple spots within a nonenhancing tumor. Regions suspicious for higher tumor grade can be identified and biopsy performed to further confirm tumor grade and potentially avoid histological undergrading. Integration of such an AI-assisted system into the intraoperative workflow may ultimately enhance surgical precision and improve diagnostic yield while streamlining decision-making and improving workflow efficiency during brain tumor surgeries. However, prior to being implemented in clinical practice, our AI model needs to be refined. A more comprehensive dataset collected from multiple institutions and annotated uniformly should be used to further train and validate the model. A more balanced distribution of tumor histology type and WHO grade would be highly desirable. Prospective clinical studies should be considered to determine the reliability, clinical benefit, and workflow impact of the model.</p>
</sec>
<sec id="s5d"><label>4.4</label><title>Limitations</title>
<p>The relatively small sample size and single-institution nature limited this study. However, the point of this study was a proof of principle, procedure, and exploration of appropriate and applicable AI image assessment programs to engage with the modes of CLE image acquisition and interpretation. All CLE images were acquired at a single neurosurgical center using standardized imaging protocols and interpreted by a single neuropathologist, which may not fully represent the variability of intraoperative imaging conditions, tumor pathologies, and inter-rater variability across other institutions. To mitigate potential bias and reduce randomization errors from assigning CLE image sequences to training and testing sets, we repeated all experiments across 3 different random splits and reported the average top-1 accuracy. This approach aimed to ensure a more robust and reliable performance evaluation despite the dataset size constraint. The results of this study will inform sample size calculations for future, larger-scale experiments aimed at validating the conclusions of this preliminary work. Furthermore, more than half of the cases in our dataset were oligodendrogliomas, whereas astrocytomas were significantly underrepresented. This may limit the generalizability of our results; nevertheless, we consider this feasibility study to be an important first step. Future expansion of the dataset with a more balanced representation of tumor types will likely enhance the robustness and performance of our model.</p>
</sec>
</sec>
<sec id="s6" sec-type="conclusions"><label>5</label><title>Conclusions</title>
<p>This study presents a novel image sequence&#x2013;based AI framework that significantly enhances the interpretation of CLE image sequences for intraoperative assessment of surgically and diagnostically complex nonenhancing brain tumors. By incorporating temporal context through a transformer encoder and integrating advanced visual backbone models, the proposed system outperforms conventional frame-based approaches, achieving accuracy comparable to that of an expert neuropathologist. This combined visual-temporal approach not only improves classification consistency and robustness, particularly in sequences affected by artifacts, but also aligns closely with the real-world interpretations of human experts. When hundreds to thousands of CLE images are acquired intraoperatively in real time, such AI systems may rapidly interpret images, especially those of brain tumors (i.e., gliomas) that exhibit high heterogeneity or possess complex imaging features. These findings highlight the potential of merging sequence-based deep learning models with CLE to support clinical decision-making during nonenhancing tumor resections, thereby reducing subjectivity in CLE interpretation and streamlining intraoperative surgical decisions.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability"><title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s13">Supplementary Material</xref>, and further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s8" sec-type="ethics-statement"><title>Ethics statement</title>
<p>The studies involving humans were approved by Institutional Review Board of Human Research at St. Joseph&#x0027;s Hospital and Medical Center. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s9" sec-type="author-contributions"><title>Author contributions</title>
<p>JC: Methodology, Conceptualization, Formal analysis, Software, Writing &#x2013; original draft, Visualization. YX: Formal analysis, Writing &#x2013; original draft, Methodology, Data curation, Conceptualization. IA: Conceptualization, Data curation, Writing &#x2013; review &#x0026; editing. CC-V: Writing &#x2013; original draft, Formal analysis. TO: Formal analysis, Writing &#x2013; review &#x0026; editing. JE: Data curation, Writing &#x2013; review &#x0026; editing. BL: Supervision, Methodology, Writing &#x2013; review &#x0026; editing. MP: Funding acquisition, Conceptualization, Project administration, Writing &#x2013; review &#x0026; editing, Supervision.</p>
</sec>
<sec id="s11" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The author(s) declared that they were an editorial board member of Frontiers at the time of submission. This had no impact on the peer review process and the final decision.</p>
<p>The reviewer FR declared a past collaboration with the authors to the handling editor at the time of review.</p>
</sec>
<sec id="s12" sec-type="ai-statement"><title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s14" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s1"><title>Author disclaimer</title>
<p>Part of this manuscript was previously presented as a poster at the 91st American Association of Neurological Surgeons Annual Scientific Meeting, April 21&#x2013;24, 2023, in Los Angeles, California, USA.</p>
</sec>
<sec id="s13" sec-type="supplementary-material"><title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fsurg.2025.1655374/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fsurg.2025.1655374/full&#x0023;supplementary-material</ext-link></p>
<supplementary-material xlink:href="Video1.mp4" id="SM1" mimetype="video/mp4">
<label>Supplementary Video S1</label>
<caption><p>A confocal laser endomicroscopy image sequence from a case of WHO grade 1 multinodular and vacuolating neuronal tumor. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
</supplementary-material>
<supplementary-material xlink:href="Video2.mp4" id="SM2" mimetype="video/mp4">
<label>Supplementary Video S2</label>
<caption><p>A confocal laser endomicroscopy image sequence from a case of WHO grade 3 oligodendroglioma. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
</supplementary-material>
<supplementary-material xlink:href="Video3.mp4" id="SM3" mimetype="video/mp4">
<label>Supplementary Video S3</label>
<caption><p>A confocal laser endomicroscopy image sequence from a case of WHO grade 3 astrocytoma. <italic>Used with permission from Barrow Neurological Institute, Phoenix, Arizona</italic>.</p></caption>
</supplementary-material>
</sec>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Torp</surname> <given-names>SH</given-names></name> <name><surname>Solheim</surname> <given-names>O</given-names></name> <name><surname>Skjulsvik</surname> <given-names>AJ</given-names></name></person-group>. <article-title>The WHO 2021 classification of central nervous system tumours: a practical update on what neurosurgeons need to know-a minireview</article-title>. <source>Acta Neurochir</source>. (<year>2022</year>) <volume>164</volume>(<issue>9</issue>):<fpage>2453</fpage>&#x2013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1007/s00701-022-05301-y</pub-id><pub-id pub-id-type="pmid">35879477</pub-id></mixed-citation></ref>
<ref id="B2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Komori</surname> <given-names>T</given-names></name></person-group>. <article-title>The 2021 WHO classification of tumors, 5th edition, central nervous system tumors: the 10 basic principles</article-title>. <source>Brain Tumor Pathol</source>. (<year>2022</year>) <volume>39</volume>(<issue>2</issue>):<fpage>47</fpage>&#x2013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1007/s10014-022-00428-3</pub-id><pub-id pub-id-type="pmid">35316415</pub-id></mixed-citation></ref>
<ref id="B3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>Y</given-names></name> <name><surname>Xu</surname> <given-names>Z</given-names></name> <name><surname>Hu</surname> <given-names>W</given-names></name> <name><surname>Deng</surname> <given-names>P</given-names></name> <name><surname>Ma</surname> <given-names>M</given-names></name> <name><surname>Wu</surname> <given-names>J</given-names></name></person-group>. <article-title>Comprehensive multi-omics and machine learning framework for glioma subtyping and precision therapeutics</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>(<issue>1</issue>):<fpage>24874</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-09742-0</pub-id><pub-id pub-id-type="pmid">40640380</pub-id></mixed-citation></ref>
<ref id="B4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khilar</surname> <given-names>S</given-names></name> <name><surname>Dembinska-Kenner</surname> <given-names>A</given-names></name> <name><surname>Hall</surname> <given-names>H</given-names></name> <name><surname>Syrmos</surname> <given-names>N</given-names></name> <name><surname>Ligarotti</surname> <given-names>GKI</given-names></name> <name><surname>Plaha</surname> <given-names>P</given-names></name><etal/></person-group> <article-title>Towards a new dawn for neuro-oncology: nanomedicine at the service of drug delivery for primary and secondary brain tumours</article-title>. <source>Brain Sci</source>. (<year>2025</year>) <volume>15</volume>(<issue>2</issue>):<fpage>136</fpage>. <pub-id pub-id-type="doi">10.3390/brainsci15020136</pub-id><pub-id pub-id-type="pmid">40002469</pub-id></mixed-citation></ref>
<ref id="B5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>YW</given-names></name> <name><surname>Kim</surname> <given-names>S</given-names></name> <name><surname>Han</surname> <given-names>K</given-names></name> <name><surname>Ahn</surname> <given-names>SS</given-names></name> <name><surname>Moon</surname> <given-names>JH</given-names></name> <name><surname>Kang</surname> <given-names>S-G</given-names></name><etal/></person-group> <article-title>Rethinking extent of resection of contrast-enhancing and non-enhancing tumor: different survival impacts on adult-type diffuse gliomas in 2021 World Health Organization classification</article-title>. <source>Eur Radiol</source>. (<year>2024</year>) <volume>34</volume>(<issue>2</issue>):<fpage>1376</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1007/s00330-023-10125-0</pub-id><pub-id pub-id-type="pmid">37608093</pub-id></mixed-citation></ref>
<ref id="B6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Halfpenny</surname> <given-names>AM</given-names></name> <name><surname>Wood</surname> <given-names>MD</given-names></name></person-group>. <article-title>Review of the recent changes in the WHO classification for pediatric brain and spinal cord tumors</article-title>. <source>Pediatr Neurosurg</source>. (<year>2023</year>) <volume>58</volume>(<issue>5</issue>):<fpage>337</fpage>&#x2013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.1159/000528957</pub-id><pub-id pub-id-type="pmid">36617415</pub-id></mixed-citation></ref>
<ref id="B7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Eichberg</surname> <given-names>DG</given-names></name> <name><surname>Di</surname> <given-names>L</given-names></name> <name><surname>Morell</surname> <given-names>AA</given-names></name> <name><surname>Shah</surname> <given-names>AH</given-names></name> <name><surname>Semonche</surname> <given-names>AM</given-names></name> <name><surname>Chin</surname> <given-names>CN</given-names></name><etal/></person-group> <article-title>Incidence of high grade gliomas presenting as radiographically non-enhancing lesions: experience in 111 surgically treated non-enhancing gliomas with tissue diagnosis</article-title>. <source>J Neurooncol</source>. (<year>2020</year>) <volume>147</volume>(<issue>3</issue>):<fpage>671</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1007/s11060-020-03474-z</pub-id><pub-id pub-id-type="pmid">32221785</pub-id></mixed-citation></ref>
<ref id="B8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Krigers</surname> <given-names>A</given-names></name> <name><surname>Demetz</surname> <given-names>M</given-names></name> <name><surname>Grams</surname> <given-names>AE</given-names></name> <name><surname>Thom&#x00E9;</surname> <given-names>C</given-names></name> <name><surname>Freyschlag</surname> <given-names>CF</given-names></name></person-group>. <article-title>The diagnostic value of contrast enhancement on MRI in diffuse and anaplastic gliomas</article-title>. <source>Acta Neurochir</source>. (<year>2022</year>) <volume>164</volume>(<issue>8</issue>):<fpage>2035</fpage>&#x2013;<lpage>40</lpage>. <pub-id pub-id-type="doi">10.1007/s00701-021-05103-8</pub-id><pub-id pub-id-type="pmid">35018531</pub-id></mixed-citation></ref>
<ref id="B9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Y</given-names></name> <name><surname>Mathis</surname> <given-names>AM</given-names></name> <name><surname>Pollo</surname> <given-names>B</given-names></name> <name><surname>Schlegel</surname> <given-names>J</given-names></name> <name><surname>Maragkou</surname> <given-names>T</given-names></name> <name><surname>Seidel</surname> <given-names>K</given-names></name><etal/></person-group> <article-title>Intraoperative in vivo confocal laser endomicroscopy imaging at glioma margins: can we detect tumor infiltration?</article-title> <source>J Neurosurg</source>. (<year>2024</year>) <volume>140</volume>(<issue>2</issue>):<fpage>357</fpage>&#x2013;<lpage>66</lpage>. <pub-id pub-id-type="doi">10.3171/2023.5.JNS23546</pub-id><pub-id pub-id-type="pmid">37542440</pub-id></mixed-citation></ref>
<ref id="B10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Shaffer</surname> <given-names>KV</given-names></name> <name><surname>Lin</surname> <given-names>C</given-names></name> <name><surname>Byvaltsev</surname> <given-names>VA</given-names></name> <name><surname>Preul</surname> <given-names>MC</given-names></name> <name><surname>Chen</surname> <given-names>L</given-names></name></person-group>. <article-title>Blood-brain barrier, blood-brain tumor barrier, and fluorescence-guided neurosurgical oncology: delivering optical labels to brain tumors</article-title>. <source>Front Oncol</source>. (<year>2020</year>) <volume>10</volume>:<fpage>739</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2020.00739</pub-id><pub-id pub-id-type="pmid">32582530</pub-id></mixed-citation></ref>
<ref id="B11"><label>11.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Izadyyazdanabadi</surname> <given-names>M</given-names></name> <name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Cavallo</surname> <given-names>C</given-names></name> <name><surname>Zhao</surname> <given-names>X</given-names></name> <name><surname>Gandhi</surname> <given-names>S</given-names></name> <name><surname>Moreira</surname> <given-names>LB</given-names></name><etal/></person-group> <article-title>Weakly-supervised learning-based feature localization for confocal laser endomicroscopy glioma images</article-title>. <conf-name>International Conference on Medical Image Computing and Computer-Assisted Intervention</conf-name>; <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (<year>2018</year>). p. <fpage>300</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation></ref>
<ref id="B12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Izadyyazdanabadi</surname> <given-names>M</given-names></name> <name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Zhao</surname> <given-names>X</given-names></name> <name><surname>Moreira</surname> <given-names>LB</given-names></name> <name><surname>Gandhi</surname> <given-names>S</given-names></name> <name><surname>Cavallo</surname> <given-names>C</given-names></name><etal/></person-group> <article-title>Fluorescence image histology pattern transformation using image style transfer</article-title>. <source>Front Oncol</source>. (<year>2019</year>) <volume>9</volume>:<fpage>519</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2019.00519</pub-id><pub-id pub-id-type="pmid">31293966</pub-id></mixed-citation></ref>
<ref id="B13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Izadyyazdanabadi</surname> <given-names>M</given-names></name> <name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Mooney</surname> <given-names>M</given-names></name> <name><surname>Martirosyan</surname> <given-names>N</given-names></name> <name><surname>Eschbacher</surname> <given-names>J</given-names></name> <name><surname>Nakaji</surname> <given-names>P</given-names></name><etal/></person-group> <article-title>Convolutional neural networks: ensemble modeling, fine-tuning and unsupervised semantic localization for neurosurgical CLE images</article-title>. <source>J Vis Commun Image Represent</source>. (<year>2018</year>) <volume>54</volume>:<fpage>10</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1016/j.jvcir.2018.04.004</pub-id></mixed-citation></ref>
<ref id="B14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ziebart</surname> <given-names>A</given-names></name> <name><surname>Stadniczuk</surname> <given-names>D</given-names></name> <name><surname>Roos</surname> <given-names>V</given-names></name> <name><surname>Ratliff</surname> <given-names>M</given-names></name> <name><surname>von Deimling</surname> <given-names>A</given-names></name> <name><surname>H&#x00E4;nggi</surname> <given-names>D</given-names></name><etal/></person-group> <article-title>Deep neural network for differentiation of brain tumor tissue displayed by confocal laser endomicroscopy</article-title>. <source>Front Oncol</source>. (<year>2021</year>) <volume>11</volume>:<fpage>668273</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2021.668273</pub-id><pub-id pub-id-type="pmid">34046358</pub-id></mixed-citation></ref>
<ref id="B15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aubreville</surname> <given-names>M</given-names></name> <name><surname>Knipfer</surname> <given-names>C</given-names></name> <name><surname>Oetter</surname> <given-names>N</given-names></name> <name><surname>Jaremenko</surname> <given-names>C</given-names></name> <name><surname>Rodner</surname> <given-names>E</given-names></name> <name><surname>Denzler</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>Automatic classification of cancerous tissue in laserendomicroscopy images of the oral cavity using deep learning</article-title>. <source>Sci Rep</source>. (<year>2017</year>) <volume>7</volume>(<issue>1</issue>):<fpage>11979</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-12320-8</pub-id><pub-id pub-id-type="pmid">28931888</pub-id></mixed-citation></ref>
<ref id="B16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Y</given-names></name> <name><surname>On</surname> <given-names>TJ</given-names></name> <name><surname>Abramov</surname> <given-names>I</given-names></name> <name><surname>Alcantar-Garibay</surname> <given-names>O</given-names></name> <name><surname>Hartke</surname> <given-names>JN</given-names></name> <name><surname>Eschbacher</surname> <given-names>JM</given-names></name><etal/></person-group> <article-title>A single-institution experience with intraoperative in vivo confocal laser endomicroscopy for brain tumors in 50 patients</article-title>. <source>Front Oncol</source>. (<year>2025</year>) <volume>15</volume>:<fpage>1565935</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2025.1565935</pub-id><pub-id pub-id-type="pmid">40438692</pub-id></mixed-citation></ref>
<ref id="B17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Martirosyan</surname> <given-names>NL</given-names></name> <name><surname>Eschbacher</surname> <given-names>JM</given-names></name> <name><surname>Kalani</surname> <given-names>MY</given-names></name> <name><surname>Turner</surname> <given-names>JD</given-names></name> <name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Spetzler</surname> <given-names>RF</given-names></name><etal/></person-group> <article-title>Prospective evaluation of the utility of intraoperative confocal laser endomicroscopy in patients with brain neoplasms using fluorescein sodium: experience with 74 cases</article-title>. <source>Neurosurg Focus</source>. (<year>2016</year>) <volume>40</volume>(<issue>3</issue>):<fpage>E11</fpage>. <pub-id pub-id-type="doi">10.3171/2016.1.FOCUS15559</pub-id><pub-id pub-id-type="pmid">26926051</pub-id></mixed-citation></ref>
<ref id="B18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cho</surname> <given-names>H</given-names></name> <name><surname>Moon</surname> <given-names>D</given-names></name> <name><surname>Heo</surname> <given-names>SM</given-names></name> <name><surname>Chu</surname> <given-names>J</given-names></name> <name><surname>Bae</surname> <given-names>H</given-names></name> <name><surname>Choi</surname> <given-names>S</given-names></name><etal/></person-group> <article-title>Artificial intelligence-based real-time histopathology of gastric cancer using confocal laser endomicroscopy</article-title>. <source>NPJ Precis Oncol</source>. (<year>2024</year>) <volume>8</volume>(<issue>1</issue>):<fpage>131</fpage>. <pub-id pub-id-type="doi">10.1038/s41698-024-00621-x</pub-id><pub-id pub-id-type="pmid">38877301</pub-id></mixed-citation></ref>
<ref id="B19"><label>19.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>J</given-names></name> <name><surname>Dong</surname> <given-names>W</given-names></name> <name><surname>Socher</surname> <given-names>R</given-names></name> <name><surname>Li</surname> <given-names>LJ</given-names></name> <name><surname>Li</surname> <given-names>K</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L</given-names></name></person-group>. <article-title>Imagenet: a large-scale hierarchical image database</article-title>. <conf-name>2009 IEEE Conference on Computer Vision and Pattern Recognition</conf-name> (<year>2009</year>). p. <fpage>248</fpage>&#x2013;<lpage>55</lpage></mixed-citation></ref>
<ref id="B20"><label>20.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Recht</surname> <given-names>B</given-names></name> <name><surname>Roelofs</surname> <given-names>R</given-names></name> <name><surname>Schmidt</surname> <given-names>L</given-names></name> <name><surname>Shankar</surname> <given-names>V</given-names></name></person-group>. <article-title>Do ImageNet classifiers generalize to imagenet?</article-title> <conf-name>International Conference on Machine Learning: PMLR</conf-name> (<year>2019</year>). p. <fpage>5389</fpage>&#x2013;<lpage>400</lpage></mixed-citation></ref>
<ref id="B21"><label>21.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Szegedy</surname> <given-names>C</given-names></name> <name><surname>Ioffe</surname> <given-names>S</given-names></name> <name><surname>Vanhoucke</surname> <given-names>V</given-names></name> <name><surname>Alemi</surname> <given-names>A</given-names></name></person-group>. <article-title>Inception-v4, Inception-resnet and the impact of residual connections on learning</article-title>. <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name> (<year>2017</year>).</mixed-citation></ref>
<ref id="B22"><label>22.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Ren</surname> <given-names>S</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name></person-group>. <article-title>Deep residual learning for image recognition</article-title>. <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name> (<year>2016</year>). p. <fpage>770</fpage>&#x2013;<lpage>8</lpage></mixed-citation></ref>
<ref id="B23"><label>23.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Simonyan</surname> <given-names>K</given-names></name> <name><surname>Zisserman</surname> <given-names>A</given-names></name></person-group>. <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <comment><italic>arXiv</italic> [Preprint] <italic>arXiv:1409.1556</italic> (2014)</comment>.</mixed-citation></ref>
<ref id="B24"><label>24.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname> <given-names>A</given-names></name> <name><surname>Beyer</surname> <given-names>L</given-names></name> <name><surname>Kolesnikov</surname> <given-names>A</given-names></name> <name><surname>Weissenborn</surname> <given-names>D</given-names></name> <name><surname>Zhai</surname> <given-names>X</given-names></name> <name><surname>Unterthiner</surname> <given-names>T</given-names></name><etal/></person-group> <article-title>An image is worth 16&#x00D7;16 words: transformers for image recognition at scale</article-title>. <comment><italic>arXiv</italic> [Preprint] <italic>arXiv:2010.11929</italic> (2020)</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id></mixed-citation></ref>
<ref id="B25"><label>25.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A</given-names></name> <name><surname>Shazeer</surname> <given-names>N</given-names></name> <name><surname>Parmar</surname> <given-names>N</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J</given-names></name> <name><surname>Jones</surname> <given-names>L</given-names></name> <name><surname>Gomez</surname> <given-names>AN</given-names></name><etal/></person-group> <article-title>Attention is all you need</article-title>. <source>arXiv <italic>[Preprint]</italic>. arXiv:1706.03762</source>. (<year>2017</year>). <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></mixed-citation></ref>
<ref id="B26"><label>26.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Bai</surname> <given-names>S</given-names></name> <name><surname>Kolter</surname> <given-names>JZ</given-names></name> <name><surname>Koltun</surname> <given-names>V</given-names></name></person-group>. <article-title>An empirical evaluation of generic convolutional and recurrent networks for sequence modeling</article-title>. <comment><italic>arXiv</italic> [Preprint] <italic>arXiv:1803.01271</italic> (2018)</comment>.</mixed-citation></ref>
<ref id="B27"><label>27.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Izadyyazdanabadi</surname> <given-names>M</given-names></name> <name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Martirosyan</surname> <given-names>N</given-names></name> <name><surname>Eschbacher</surname> <given-names>J</given-names></name> <name><surname>Nakaji</surname> <given-names>P</given-names></name> <name><surname>Yang</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>Improving utility of brain tumor confocal laser endomicroscopy: objective value assessment and diagnostic frame detection with convolutional neural networks</article-title>. <conf-name>SPIE</conf-name> (<year>2017</year>).</mixed-citation></ref>
<ref id="B28"><label>28.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hochreiter</surname> <given-names>S</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J</given-names></name></person-group>. <article-title>Long short-term memory</article-title>. <source>Neural Comput</source>. (<year>1997</year>) <volume>9</volume>(<issue>8</issue>):<fpage>1735</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id><pub-id pub-id-type="pmid">9377276</pub-id></mixed-citation></ref>
<ref id="B29"><label>29.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abramov</surname> <given-names>I</given-names></name> <name><surname>Park</surname> <given-names>MT</given-names></name> <name><surname>Belykh</surname> <given-names>E</given-names></name> <name><surname>Dru</surname> <given-names>AB</given-names></name> <name><surname>Xu</surname> <given-names>Y</given-names></name> <name><surname>Gooldy</surname> <given-names>TC</given-names></name><etal/></person-group> <article-title>Intraoperative confocal laser endomicroscopy: prospective in vivo feasibility study of a clinical-grade system for brain tumors</article-title>. <source>J Neurosurg</source>. (<year>2023</year>) <volume>138</volume>(<issue>3</issue>):<fpage>587</fpage>&#x2013;<lpage>97</lpage>. <pub-id pub-id-type="doi">10.3171/2022.5.JNS2282</pub-id><pub-id pub-id-type="pmid">35901698</pub-id></mixed-citation></ref>
<ref id="B30"><label>30.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Muscas</surname> <given-names>G</given-names></name> <name><surname>Visocchi</surname> <given-names>E</given-names></name> <name><surname>Parenti</surname> <given-names>A</given-names></name> <name><surname>Capelli</surname> <given-names>F</given-names></name> <name><surname>Petti</surname> <given-names>M</given-names></name> <name><surname>Esposito</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Operative microscope in-field visualization of confocal laser endomicroscopy interface (Zeiss CONVIVO&#x00AE;)</article-title>. <source>Oper Neurosurg</source>. (<year>2025</year>) <volume>29</volume>:<fpage>860</fpage>&#x2013;<lpage>4</lpage>. <pub-id pub-id-type="doi">10.1227/ons.0000000000001560</pub-id><pub-id pub-id-type="pmid">40198207</pub-id></mixed-citation></ref>
<ref id="B31"><label>31.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>MT</given-names></name> <name><surname>Abramov</surname> <given-names>I</given-names></name> <name><surname>Gooldy</surname> <given-names>TC</given-names></name> <name><surname>Smith</surname> <given-names>KA</given-names></name> <name><surname>Porter</surname> <given-names>RW</given-names></name> <name><surname>Little</surname> <given-names>AS</given-names></name><etal/></person-group> <article-title>Introduction of in vivo confocal laser endomicroscopy and real-time telepathology for remote intraoperative neurosurgery-pathology consultation</article-title>. <source>Oper Neurosurg</source>. (<year>2022</year>) <volume>23</volume>(<issue>3</issue>):<fpage>261</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1227/ons.0000000000000288</pub-id><pub-id pub-id-type="pmid">35972091</pub-id></mixed-citation></ref></ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/609198/overview">Mario Ammirati</ext-link>, Temple University, United States</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1098547/overview">Julius H&#x00F6;hne</ext-link>, Paracelsus Medical Private University, Nuremberg, Germany</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1167194/overview">Francesco Restelli</ext-link>, IRCCS Carlo Besta Neurological Institute Foundation, Italy</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1628177/overview">Francesco Carbone</ext-link>, St&#x00E4;dtisches Klinikum Karlsruhe, Germany</p></fn>
</fn-group>
<fn-group>
<fn fn-type="abbr" id="abbrev1"><label>Abbreviations:</label><p>AI, artificial intelligence; CLE, confocal laser endomicroscopy; CNS, central nervous system; FNa, fluorescein sodium; GPU, graphics processing unit; H&#x0026;E, hematoxylin and eosin; MRI, magnetic resonance imaging; ROI, region of interest; WHO, World Health Organization.</p></fn>
</fn-group>
</back>
</article>