<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2025.1539845</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Oncology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Predicting overall survival in glioblastoma patients using machine learning: an analysis of treatment efficacy and patient prognosis</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Onciul</surname>
<given-names>Razvan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2921023/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Brehar</surname>
<given-names>Felix-Mircea</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2352775/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Dumitru</surname>
<given-names>Adrian Vasile</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Crivoi</surname>
<given-names>Carla</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Covache-Busuioc</surname>
<given-names>Razvan-Adrian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2578125/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Serban</surname>
<given-names>Matei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2914139/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Radoi</surname>
<given-names>Petrinel Mugurel</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Toader</surname>
<given-names>Corneliu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Neurosurgery, &#x201c;Carol Davila&#x201d; University of Medicine and Pharmacy</institution>, <addr-line>Bucharest</addr-line>, <country>Romania</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Neurosurgery Department, Emergency University Hospital</institution>, <addr-line>Bucharest</addr-line>, <country>Romania</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Neurosurgery, Clinical Emergency Hospital &#x201c;Bagdasar-Arseni&#x201d;</institution>, <addr-line>Bucharest</addr-line>, <country>Romania</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Pathology, University Emergency Hospital Bucharest, Carol Davila University of Medicine and Pharmacy</institution>, <addr-line>Bucharest</addr-line>, <country>Romania</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Department of Computer Science, Faculty of Mathematics and Computer Science, University of Bucharest</institution>, <addr-line>Bucharest</addr-line>, <country>Romania</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Department of Vascular Neurosurgery, National Institute of Neurology and Neurovascular Diseases</institution>, <addr-line>Bucharest</addr-line>, <country>Romania</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Abhishek Mahajan, The Clatterbridge Cancer Centre, United Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Zesheng Li, Tianjin Neurological Institute, China</p>
<p>Joselyn Esther Zapata Paulini, Continental University, Peru</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Felix-Mircea Brehar, <email xlink:href="mailto:felix.brehar@umfcd.ro">felix.brehar@umfcd.ro</email>; Adrian Vasile Dumitru, <email xlink:href="mailto:vasile.dumitru@umfcd.ro">vasile.dumitru@umfcd.ro</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>04</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>15</volume>
<elocation-id>1539845</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>12</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>03</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Onciul, Brehar, Dumitru, Crivoi, Covache-Busuioc, Serban, Radoi and Toader</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Onciul, Brehar, Dumitru, Crivoi, Covache-Busuioc, Serban, Radoi and Toader</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Glioblastoma (GBM), the most aggressive primary brain tumor, poses a significant challenge in predicting patient survival due to its heterogeneity and resistance to treatment. Accurate survival prediction is essential for optimizing treatment strategies and improving clinical outcomes.</p>
</sec>
<sec>
<title>Methods</title>
<p>This study utilized metadata from 135 GBM patients, including demographic, clinical, and molecular variables such as age, Karnofsky Performance Status (KPS), MGMT promoter methylation, and EGFR amplification. Six machine learning models&#x2014;XGBoost, Random Forests, Support Vector Machines, Artificial Neural Networks, Extra Trees Regressor, and K-Nearest Neighbors&#x2014;were employed to classify patients into predefined survival categories. Data preprocessing included label encoding for categorical variables and MinMax scaling for numerical features. Model performance was assessed using ROC-AUC and accuracy metrics, with hyperparameters optimized through grid search.</p>
</sec>
<sec>
<title>Results</title>
<p>XGBoost demonstrated the highest predictive accuracy, achieving a mean ROC-AUC of 0.90 and an accuracy of 0.78. Ensemble models outperformed simpler classifiers, emphasizing the predictive value of metadata. The models identified key prognostic markers, including MGMT promoter methylation and KPS, as significant contributors to survival prediction.</p>
</sec>
<sec>
<title>Conclusions</title>
<p>The application of machine learning to GBM metadata offers a robust approach to predicting patient survival. The study highlights the potential of ML models to enhance clinical decision-making and contribute to personalized treatment strategies, with a focus on accuracy, reliability, and interpretability.</p>
</sec>
</abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>prognostic biomarkers</kwd>
<kwd>explainable AI</kwd>
<kwd>survival prediction</kwd>
<kwd>clinical decision support</kwd>
<kwd>personalized medicine</kwd>
<kwd>predictive modeling</kwd>
</kwd-group>
<counts>
<fig-count count="10"/>
<table-count count="2"/>
<equation-count count="1"/>
<ref-count count="21"/>
<page-count count="12"/>
<word-count count="4542"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Neuro-Oncology and Neurosurgical Oncology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Glioblastoma (GBM) remains the most aggressive and fatal primary brain tumor in adults, with a median survival of just 15 months despite advancements in surgical, radiotherapeutic, and chemotherapeutic interventions. This stark prognosis is driven by GBM&#x2019;s inherent heterogeneity and resistance to treatment, making precise prognostic assessments critical yet elusive (<xref ref-type="bibr" rid="B1">1</xref>). Traditional methods often fail to fully capture the intricate biological and clinical interplay that shapes patient outcomes, paving the way for innovative computational approaches to address this gap (<xref ref-type="bibr" rid="B2">2</xref>).</p>
<p>Machine learning (ML) has introduced a transformative perspective in predicting GBM survival by leveraging multidimensional data. Recent advancements in ML enable the integration of diverse inputs, such as genetic markers, epigenetic profiles, and clinical variables, into predictive models (<xref ref-type="bibr" rid="B3">3</xref>). These systems have moved beyond conventional tools, offering individualized survival estimates that reflect the complexity of GBM biology. Multimodal approaches have proven especially impactful, synthesizing molecular data with imaging and clinical parameters to deliver nuanced, patient-specific insights (<xref ref-type="bibr" rid="B4">4</xref>).</p>
<p>Radiomics, a rapidly evolving field, has further enhanced survival prediction by unlocking the potential of standard imaging techniques. Through the extraction of high-dimensional features from MRI scans, radiomics reveals patterns linked to tumor progression and microenvironment characteristics (<xref ref-type="bibr" rid="B5">5</xref>). When combined with deep learning, these features have become powerful prognostic indicators, offering an unprecedented level of precision and interpretability. Such approaches are not only predictive but also uncover new biological connections, linking imaging characteristics to molecular and clinical outcomes (<xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>In parallel, machine learning models have demonstrated the ability to address challenges related to data variability and interpretability. By leveraging advanced algorithms, these models can effectively identify key survival factors and achieve robust predictive accuracy. This approach has facilitated the identification of critical variables, offering actionable insights to support the development of tailored treatment strategies (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>Despite these advances, challenges remain, including data imbalance and the need for broader validation across diverse cohorts. Innovative solutions, such as data augmentation and transfer learning, are actively addressing these barriers, pushing the boundaries of what ML can achieve in clinical settings (<xref ref-type="bibr" rid="B8">8</xref>).</p>
<p>This study builds on these novel developments, harnessing advanced ML methods to create predictive models that incorporate diverse data modalities. By addressing existing limitations and enhancing interpretability, this work aims to improve survival predictions and contribute to a more personalized approach to GBM management.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Background</title>
<p>GBM is a highly aggressive brain tumor with poor outcomes, presenting unique challenges for accurate prognosis and treatment planning. Traditional approaches often struggle to account for the complex biological and clinical variability inherent to GBM, necessitating advanced methodologies to improve survival predictions (<xref ref-type="bibr" rid="B9">9</xref>). ML has emerged as a promising tool to address these gaps, leveraging diverse datasets to provide more personalized and precise prognostic insights.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Advances in machine learning for GBM prognosis</title>
<p>The use of radiomics, a field focused on extracting detailed imaging features, has significantly enhanced the prognostic capabilities of ML. By analyzing high-dimensional data from MRI scans, radiomics enables models to detect subtle patterns related to tumor behavior and patient outcomes (<xref ref-type="bibr" rid="B10">10</xref>). For example, ML models incorporating imaging-derived metrics, such as texture and shape features, have demonstrated considerable accuracy in predicting survival times. These approaches are especially valuable for their non-invasive nature and ability to complement existing clinical evaluations (<xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>Beyond imaging, molecular profiling has emerged as a critical component in GBM prognosis. Genomic and transcriptomic data have proven essential for identifying key survival markers, such as MGMT promoter methylation and IDH mutation status. The integration of molecular data into ML frameworks has facilitated more nuanced stratifications of patient outcomes, offering insights that align closely with tumor heterogeneity (<xref ref-type="bibr" rid="B12">12</xref>). Multi-omics approaches, which combine molecular, proteomic, and clinical information, further enhance predictive accuracy by capturing a holistic view of the disease (<xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>Multimodal frameworks that combine radiomic, molecular, and clinical data have demonstrated exceptional potential for survival prediction. Ensemble learning algorithms, such as gradient boosting and random forests, excel in synthesizing disparate data types to uncover predictive patterns. These models are particularly effective in handling data variability and prioritizing key survival factors, making them reliable tools for GBM prognosis (<xref ref-type="bibr" rid="B14">14</xref>).</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Addressing challenges in ML applications</title>
<p>Despite the advancements, several obstacles remain in applying ML to GBM prognosis. One of the most pressing issues is class imbalance, where long-term survival categories are underrepresented in datasets (<xref ref-type="bibr" rid="B15">15</xref>). Techniques such as the Synthetic Minority Oversampling Technique (SMOTE) have been employed to address this imbalance, enhancing model robustness and improving predictions for minority classes. Additionally, the interpretability of complex ML models poses a challenge for clinical adoption (<xref ref-type="bibr" rid="B16">16</xref>). Emerging tools like SHapley Additive exPlanations (SHAP) and Local Interpretable Model-agnostic Explanations (LIME) are bridging this gap by elucidating the contributions of individual features to model predictions, fostering greater trust and usability in clinical contexts (<xref ref-type="bibr" rid="B17">17</xref>, <xref ref-type="bibr" rid="B18">18</xref>).</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Relevance of current study</title>
<p>This study aims to build on these advancements by integrating molecular and clinical data into an advanced ML framework to improve GBM survival predictions. By addressing challenges such as data imbalance and interpretability, this research seeks to develop robust, transparent models that are both accurate and clinically applicable. The results aim to contribute to more personalized approaches in GBM management, advancing the integration of ML into routine oncology practice.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Data description</title>
<sec id="s3_1">
<label>3.1</label>
<title>Data Collection and Filtering</title>
<p>The dataset for this study began with 17 columns, encompassing various clinical, demographic, molecular, and treatment-related variables. To refine the dataset, we focused on patients who underwent surgical resection and had either radiotherapy or chemotherapy as part of their treatment. After applying these inclusion criteria, the final cohort consisted of 135 patients. This carefully filtered dataset provided a robust foundation for analyzing survival outcomes and training machine learning models.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Dataset overview</title>
<p>The dataset includes features essential for understanding glioblastoma prognosis, grouped into the following categories:</p>
<p>1. Demographic Features:</p>
<list list-type="simple">
<list-item>
<p>Age: The patient&#x2019;s age at diagnosis, recorded as a continuous variable.</p>
</list-item>
<list-item>
<p>Gender: A categorical variable (male/female), converted into numerical format for analysis.</p>
</list-item>
</list>
<p>2. Clinical Features:</p>
<list list-type="simple">
<list-item>
<p>Karnofsky Performance Status (KPS): A score that evaluates the patient&#x2019;s physical ability and functional independence.</p>
</list-item>
<list-item>
<p>Overall Survival (OS): The primary outcome variable, categorized into five survival classes:</p>
</list-item>
</list>
<p>&#x2022; 0&#x2013;2 months</p>
<p>&#x2022; 3&#x2013;8 months</p>
<p>&#x2022; 9&#x2013;18 months</p>
<p>&#x2022; 19&#x2013;24 months</p>
<p>&#x2022; More than 24 months</p>
<p>3. Treatment Details:</p>
<list list-type="simple">
<list-item>
<p>Radiotherapy: Indicates whether the patient received radiotherapy.</p>
</list-item>
<list-item>
<p>Chemotherapy: Indicates whether the patient underwent chemotherapy.</p>
</list-item>
<list-item>
<p>Surgical Resection: Indicates whether the patient had a surgical procedure to remove the tumor.</p>
</list-item>
</list>
<p>4. Molecular Biomarkers:</p>
<list list-type="simple">
<list-item>
<p>MGMT Promoter Methylation: A binary marker associated with the tumor&#x2019;s sensitivity to treatment.</p>
</list-item>
<list-item>
<p>EGFR Amplification: A binary marker linked to tumor growth and progression.</p>
</list-item>
</list>
<p>Features for Machine Learning Models</p>
<p>From the dataset, a selection of key features was made to train machine learning models effectively:</p>
<list list-type="bullet">
<list-item>
<p>Demographic Data: Age and gender.</p>
</list-item>
<list-item>
<p>Clinical Features: KPS score and categorized overall survival as the target variable.</p>
</list-item>
<list-item>
<p>Treatment Information: Whether the patient received radiotherapy, chemotherapy, or surgical resection.</p>
</list-item>
<list-item>
<p>Molecular Markers: MGMT promoter methylation and EGFR amplification.</p>
</list-item>
</list>
<p>This selection captures a holistic view of each patient, ensuring that the models are equipped to analyze the multifaceted factors influencing glioblastoma outcomes.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Data preparation</title>
<p>To prepare the data for machine learning, several preprocessing steps were implemented:</p>
<list list-type="bullet">
<list-item>
<p>Categorical Encoding: Variables like gender and molecular biomarkers were converted into numerical values for compatibility with ML algorithms.</p>
</list-item>
<list-item>
<p>Normalization: Continuous variables such as age and KPS score were normalized to ensure all inputs had comparable scales.</p>
</list-item>
<list-item>
<p>Classification of Survival: The overall survival variable was divided into discrete categories, enabling classification-based machine learning methods.</p>
</list-item>
</list>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Final dataset characteristics</title>
<p>The final dataset comprised 135 patients with selected features spanning demographic, clinical, molecular, and treatment-related data. This dataset provided the basis for developing machine learning models aimed at accurately predicting survival outcomes and aiding in personalized treatment strategies for glioblastoma patients.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Data preprocessing</title>
<p>In the dataset derived from the clinical study on glioblastoma, an essential preprocessing step involved converting all textual or categorical variables into numeric formats. This transformation was accomplished using label encoding, a technique where each unique text value in a column is assigned a numerical label. This process included encoding variables that describe medical interventions, like types of surgery or chemotherapy, as well as genetic features such as MGMT promoter methylation status and EGFR amplification.</p>
<p>Additionally, the numerical variables in the dataset, specifically age and Karnofsky Performance Status (KPS), underwent scaling using the MinMax method. This method transforms the data into a specified range, in our case 0 to 1, by subtracting the minimum value of each feature and then dividing by the range of the feature. The formula for MinMax scaling is:</p>
<disp-formula>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mtext>Scaled Value</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>Original Value</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>Minimum Value</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Maximum Value</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>Minimum Value</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Scaling is an important step in data preprocessing for two reasons. First, it brings uniformity to different features, ensuring that no feature dominates others in magnitude, which can affect the performance of many machine learning models. Second, it improves the convergence speed of the algorithm because most machine learning algorithms perform better when the numerical input values vary over similar ranges.</p>
<p>By normalizing the data across the entire dataset through these methods, the processed data becomes more suitable for predictive modeling, enhancing the efficiency and accuracy of the machine learning models developed from this dataset. These steps ensure that the data analysis is robust, providing reliable insights into the treatment outcomes and potential prognostic factors in glioblastoma patients.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Data preparation and survival analysis framework</title>
<p>In the clinical dataset focusing on glioblastoma, the primary outcome of interest is OS, which quantifies the duration a patient lives following their diagnosis. This target variable is pivotal for assessing the efficacy of various treatments and for making predictions about patient prognosis.</p>
<p>The dataset categorizes Overall Survival into five distinct classes based on the number of months a patient survives post-diagnosis, starting with the 0-2 months range, followed by 3-8 months, then 9-18 months, 19-24 months, and finally, more than 24 months.</p>
<p>These classes enable the machine learning models to handle survival data as a categorical variable, which simplifies the modeling of survival distributions across different patient groups.</p>
<p>The training set of the dataset reveals a distribution of patients across the survival classes, with 7 patients in Class 0 (0-2 months), 37 patients in Class 1 (3-8 months), 46 patients in Class 2 (9-18 months), and 2 patients each in Class 3 (19-24 months) and Class 4 (more than 24 months).</p>
<p>This distribution helps in understanding the model&#x2019;s learning capacity across a varied range of survival outcomes, although it highlights an imbalance in the dataset with fewer representatives in the longer survival categories.</p>
<p>The test set, used to evaluate the performance of the predictive model, contains 5 patients in Class 0 (0-2 months), 16 patients in Class 1 (3-8 months), 16 patients in Class 2 (9-18 months), and 2 patients each in Class 3 (19-24 months) and Class 4 (more than 24 months).</p>
<p>This distribution indicates how the model will be tested against unseen data, offering insights into its generalized performance across different survival times.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Models training</title>
<p>In the study focusing on glioblastoma patient survival, six types of machine learning models were employed to predict outcomes. These models include Artificial Neural Networks (ANN), Extra Trees Regressor (ETR), K-Nearest Neighbors (KNN), Random Forest (RF), Support Vector Machines (SVM), and XGBoost Regressor (XGBR). Each model brings a distinct approach to handling the data and making predictions, leveraging their unique strengths to potentially improve the accuracy of survival time predictions.</p>
<p>All models underwent a fine-tuning process to optimize their parameters, ensuring the best possible performance (<xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>). This fine-tuning was performed using grid search, a systematic approach to hyperparameter optimization. Grid search iteratively evaluates combinations of hyperparameter values to identify the configuration that delivers the best performance for each model. The process ensures that the models are well-calibrated to the dataset, avoiding underfitting or overfitting.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Optimized hyperparameter settings for each of the six machine learning models used in this study.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Classifier</th>
<th valign="top" align="left">Hyperparameters</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">Hidden Layers:<break/>&#x2003;- First layer: 32 neurons, ReLU activation<break/>&#x2003;- Second layer: 32 neurons, ReLU activation<break/>Output Layer: 5 neurons, Softmax activation<break/>Optimizer: Adam<break/>Loss Function: Categorical Crossentropy<break/>Epochs: 50<break/>Batch Size: 16</td>
</tr>
<tr>
<td valign="top" align="left">SVM</td>
<td valign="top" align="left">Kernel: &#x2018;poly&#x2019;</td>
</tr>
<tr>
<td valign="top" align="left">XGB</td>
<td valign="top" align="left">Objective: binary:logistic<break/>Column Sample By Tree: 0.5<break/>Learning Rate: 0.1<break/>Max Depth: 100<break/>Alpha: 1<break/>Number of Estimators: 50</td>
</tr>
<tr>
<td valign="top" align="left">RF</td>
<td valign="top" align="left">n_estimators: 8</td>
</tr>
<tr>
<td valign="top" align="left">ETR</td>
<td valign="top" align="left">n_estimators: 5</td>
</tr>
<tr>
<td valign="top" align="left">KNN</td>
<td valign="top" align="left">n_neighbors: 30</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The ANN model, for example, consists of two hidden layers with ReLU activation functions and was trained using the Adam optimizer and categorical crossentropy loss. The XGBoost model was fine-tuned with parameters such as a maximum tree depth of 100 and a learning rate of 0.1, while simpler models like KNN and Random Forest used optimized settings for the number of neighbors and estimators, respectively. These fine-tuned configurations ensure that each model performs optimally when predicting glioblastoma patient survival outcomes.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s5" sec-type="results">
<label>5</label>
<title>Results</title>
<p>We evaluated the performance of six machine learning classifiers by analyzing their predictive accuracy and ROC-AUC on the test set. All reported performance metrics, including ROC-AUC and accuracy, were derived from the test set, ensuring the evaluation reflects the models&#x2019; ability to generalize to unseen data. The models were trained on the training set, and hyperparameters were optimized using grid search to avoid overfitting (<xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>).</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison of the Receiver Operating Characteristic Area Under the Curve (ROC-AUC; mean and standard deviation) and accuracy for all classifiers used in the study.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Classifier</th>
<th valign="top" align="left">ROC - AUC</th>
<th valign="top" align="left">Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ANN</td>
<td valign="top" align="left">Mean: 0.73<break/>Standard deviation: 0.15</td>
<td valign="top" align="left">0.68</td>
</tr>
<tr>
<td valign="top" align="left">SVM</td>
<td valign="top" align="left">Mean: 0.84<break/>Standard deviation: 0.06</td>
<td valign="top" align="left">0.63</td>
</tr>
<tr>
<td valign="top" align="left">XGB</td>
<td valign="top" align="left">Mean: 0.90<break/>Standard deviation: 0.07</td>
<td valign="top" align="left">0.78</td>
</tr>
<tr>
<td valign="top" align="left">RF</td>
<td valign="top" align="left">Mean: 0.80<break/>Standard deviation: 0.12</td>
<td valign="top" align="left">0.66</td>
</tr>
<tr>
<td valign="top" align="left">ET</td>
<td valign="top" align="left">Mean: 0.82<break/>Standard deviation: 0.19</td>
<td valign="top" align="left">0.78</td>
</tr>
<tr>
<td valign="top" align="left">KNN</td>
<td valign="top" align="left">Mean: 0.79<break/>Standard deviation: 0.14</td>
<td valign="top" align="left">0.54</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>ROC-AUC is a key metric for evaluating the predictive accuracy of machine learning models, particularly for classification tasks involving imbalanced datasets.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In this study, we evaluated the robustness and efficiency of six machine learning classifiers by analyzing their ROC AUC scores across multiple iterations, possibly obtained through cross-validation or bootstrap resampling (<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>). This approach helps to gauge the performance stability and effectiveness of each classifier in predictive tasks.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The boxplot highlights the variation in ROC-AUC scores for each classifier. XGB demonstrates the highest median ROC-AUC score with minimal variability, followed by ET and SVM. In contrast, KNN exhibits higher variability and lower performance compared to other classifiers, suggesting sensitivity to the dataset&#x2019;s features.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g001.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> displays the XGBoost ROC curves for predicting survival across all five classes (0&#x2013;2 months, 3&#x2013;8 months, 9&#x2013;18 months, 19&#x2013;24 months, and more than 24 months). Each ROC curve illustrates the trade-off between the true positive rate (sensitivity) and the false positive rate for the corresponding survival class.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The figure highlights the model&#x2019;s performance in distinguishing between survival classes. The area under the curve (AUC) values for each class are annotated in the legend. Class 0 achieves perfect discrimination with an AUC of 1.00, while intermediate survival classes show moderate performance (e.g., Class 2 with an AUC of 0.89).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g002.tif"/>
</fig>
<p>The best results were achieved using an XGBoost algorithm, which attained an average ROC-AUC of 0.90 with a standard deviation of 0.07 and an accuracy of 0.78 on the test data. The next best outcomes were observed with an ET classifier, which demonstrated an ROC-AUC mean of 0.82, a standard deviation of 0.19, and an accuracy of 0.78. Following the XGBoost and Extra Trees classifiers, the Support Vector Machine (SVM) algorithm also showed promising results with a mean ROC-AUC of 0.84 and a low standard deviation of 0.06, although its accuracy on the test data was slightly lower at 0.63. The RF classifier, with a mean ROC-AUC of 0.80 and a standard deviation of 0.12, achieved an accuracy of 0.66, demonstrating robustness albeit with a bit more variability in its performance compared to SVM.</p>
<p>The ANN model recorded a mean ROC-AUC of 0.73 and a relatively high standard deviation of 0.15, alongside an accuracy of 0.68 on the test data, indicating less consistency in its predictive ability. Lastly, the KNN algorithm, while it had a decent mean ROC-AUC of 0.79 and a standard deviation of 0.14, showed the lowest test accuracy of 0.54, suggesting it might not be as effective in this particular setting compared to the other models (<xref ref-type="fig" rid="f3">
<bold>Figures&#xa0;3</bold>
</xref>&#x2013;<xref ref-type="fig" rid="f5">
<bold>5</bold>
</xref>).</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The RF ROC curves display class-wise prediction accuracy, with Class 0 achieving the highest AUC of 1.00. Intermediate classes (e.g., Class 1 and Class 2) demonstrate moderate predictive performance, with AUC values ranging between 0.81 and 0.82. The lower AUC for Class 3 (0.69) and Class 4 (0.71) indicates difficulty in distinguishing between these classes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g003.tif"/>
</fig>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The KNN ROC curves indicate variability in the model&#x2019;s performance across classes. Class 0 achieves a high AUC of 0.87, reflecting strong predictive capability for short-term survival. However, other classes, such as Class 3 (AUC 0.79) and Class 4 (AUC 0.94), show modest improvements over previous models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g004.tif"/>
</fig>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>The figure shows that XGB and ET achieve the highest accuracy, with values close to 0.8, indicating robust predictive capabilities. In contrast, KNN records the lowest accuracy at approximately 0.5, highlighting its limitations for this dataset. The figure underscores the overall reliability of tree-based ensemble models compared to simpler classifiers such as KNN.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g005.tif"/>
</fig>
<p>These findings highlight the efficacy of XGB and ET classifiers in handling complex predictive tasks, with XGB slightly outperforming others in terms of stability and overall performance.</p>
<p>To further assess model performance, we analyzed the confusion matrices for both the training and test sets (<xref ref-type="fig" rid="f6">
<bold>Figures&#xa0;6</bold>
</xref>, <xref ref-type="fig" rid="f7">
<bold>7</bold>
</xref>). These matrices provide a detailed breakdown of how well each classifier distinguishes between the five survival classes (0&#x2013;2 months, 3&#x2013;8 months, 9&#x2013;18 months, 19&#x2013;24 months, and &gt;24 months).</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Training Confusion Matrices for Six Machine Learning Models. This figure presents the confusion matrices for six machine learning models (KNN, Random Forest (RF), Support Vector Machine (SVM), XGBoost, Extra Trees (ET), and Artificial Neural Networks (ANN)) trained on glioblastoma patient survival classification.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g006.tif"/>
</fig>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Test Set Confusion Matrices for Six Machine Learning Models. This figure displays the confusion matrices for six machine learning models (KNN, Random Forest (RF), Support Vector Machine (SVM), XGBoost, Extra Trees (ET), and Artificial Neural Networks (ANN)) when tested on unseen data for glioblastoma survival prediction. The matrices compare the true survival classes (y-axis) against the predicted labels (x-axis) for five survival categories.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g007.tif"/>
</fig>
<p>To understand the decision-making processes and feature prioritization of the evaluated models, we applied SHAP analysis to the KNN, XGBoost, and Extra Trees models. The SHAP summary plots (<xref ref-type="fig" rid="f8">
<bold>Figures&#xa0;8</bold>
</xref>&#x2013;<xref ref-type="fig" rid="f10">
<bold>10</bold>
</xref>) provide a comprehensive evaluation of the models&#x2019; interpretability, highlighting their respective strengths and limitations.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>SHAP summary plot for the KNN model. The limited differentiation of SHAP values across features highlights KNN&#x2019;s weak ability to prioritize key variables, contributing to its poor generalization and frequent misclassifications.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g008.tif"/>
</fig>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>SHAP summary plot for the XGBoost model. KPS is the most influential feature, followed by radiotherapy, age, and MGMT promoter methylation. The distinct separation of SHAP values demonstrates XGBoost&#x2019;s capacity to effectively prioritize important prognostic factors.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g009.tif"/>
</fig>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>SHAP summary plot for the Extra Trees model. KPS is the dominant feature, followed by age, radiotherapy, and MGMT promoter methylation. The Extra Trees model demonstrates strong feature differentiation, contributing to its robust predictive performance in glioblastoma survival prediction.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1539845-g010.tif"/>
</fig>
<p>The KNN model (<xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>) demonstrated limited feature differentiation, reflecting its inherent weakness in handling high-dimensional and imbalanced data. While KPS emerged as the most influential feature, the lack of distinct separation among other variables indicates that KNN struggled to assign proper weight to important factors such as MGMT promoter methylation and radiotherapy. This deficiency is consistent with KNN&#x2019;s frequent misclassifications and its poor generalization to the test set. Given its reliance on local data density and sensitivity to sparse distributions, KNN is not well-suited for complex clinical datasets like those involving glioblastoma patients. As such, we recommend that simpler models like KNN be replaced by ensemble-based methods for tasks involving high-dimensional and heterogeneous medical data.</p>
<p>In contrast, the XGBoost model (<xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>) exhibited strong and consistent feature prioritization, which explains its superior predictive performance. KPS was identified as the most critical factor, followed closely by radiotherapy, age, and MGMT promoter methylation. These results align with established clinical knowledge, as higher KPS scores and positive MGMT promoter methylation are associated with improved survival outcomes in glioblastoma patients.</p>
<p>The clear separation of SHAP values highlights XGBoost&#x2019;s capacity to integrate diverse clinical and molecular data, effectively capturing non-linear interactions between features. This ability to discern complex patterns and prioritize clinically meaningful variables is a key factor behind its high accuracy and generalization capability.</p>
<p>The Extra Trees model (<xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>) performed comparably to XGBoost, further validating the strength of ensemble-based models in this context. KPS once again emerged as the dominant feature, underscoring its central role in survival prediction. The model effectively leveraged other important variables, including age, radiotherapy, and MGMT promoter methylation, demonstrating its ability to capture both treatment-related and biological factors. The differentiation among SHAP values shows that the Extra Trees model can robustly identify key contributors to patient outcomes, even in the presence of class imbalance and data heterogeneity. This adaptability makes it a reliable tool for clinical decision support, particularly when high interpretability and predictive performance are required.</p>
<p>Overall, the SHAP analysis underscores the superiority of ensemble-based methods like XGBoost and Extra Trees over simpler models such as KNN. By effectively prioritizing clinically significant features and accommodating complex interactions, these models offer both high accuracy and interpretability&#x2014;critical components for integrating machine learning into personalized glioblastoma treatment strategies. Future enhancements, such as incorporating longitudinal patient data or multi-omics integration, could further improve their predictive capabilities, ensuring even greater clinical utility.</p>
</sec>
<sec id="s6" sec-type="discussion">
<label>6</label>
<title>Discussion</title>
<p>This study delves into the application of advanced ML techniques to predict OS in GBM patients, offering a comprehensive integration of clinical, molecular, and treatment-related data. Our findings illuminate the strengths of ML models in capturing the complexity of GBM prognosis while identifying key challenges and opportunities for refinement.</p>
<sec id="s6_1">
<label>6.1</label>
<title>Model performance and insights</title>
<p>XGBoost emerged as the most robust model in our analysis, achieving a mean ROC-AUC of 0.90 and an accuracy of 0.78. Its ability to manage heterogeneous and non-linear data interactions aligns with its established success in oncology applications. Gradient boosting techniques, like XGBoost, have gained recognition for their versatility in handling high-dimensional datasets. For instance, studies integrating clinical, transcriptomic, and radiomic data have demonstrated XGBoost&#x2019;s capacity to identify nuanced survival patterns, underscoring its adaptability to complex datasets.</p>
<p>Other models, such as SVM and ensemble approaches like ET and RF, also demonstrated strong predictive power, with SVM achieving a mean ROC-AUC of 0.84 and ET at 0.82. SVM&#x2019;s ability to perform well on smaller datasets is particularly relevant given the limited cohort sizes often encountered in GBM research. Ensemble methods excel in feature prioritization, providing interpretable insights into key prognostic variables such as MGMT promoter methylation and KPS. These findings echo recent studies that highlight ensemble models as vital tools for identifying clinically actionable predictors in oncology.</p>
<p>Conversely, ANN and KNN showed limited predictive capacity. While ANNs have shown promise in larger datasets due to their ability to recognize intricate patterns, their performance can falter with smaller, imbalanced datasets like ours. KNN&#x2019;s relatively poor performance, with an accuracy of 0.54, suggests it may not be suitable for high-dimensional datasets with sparse or unevenly distributed features. These results align with existing literature emphasizing the limitations of these approaches in specific contexts.</p>
</sec>
<sec id="s6_2">
<label>6.2</label>
<title>Innovations and methodological contributions</title>
<p>A strength of this study lies in the rigorous preprocessing methods employed, including label encoding and MinMax scaling, which ensured uniformity across variables. By categorizing OS into distinct survival classes, the study enabled a more granular stratification of patients. This approach mirrors recent advancements in predictive oncology, where discrete outcome modeling enhances the precision of survival estimates.</p>
<p>The fine-tuning of hyperparameters across models further underscores the methodological rigor. For instance, optimizing parameters such as learning rates, tree depth, and kernel selection significantly improved model accuracy. These strategies are increasingly regarded as essential in developing reliable predictive frameworks, as evidenced in contemporary GBM prognosis research.</p>
</sec>
<sec id="s6_3">
<label>6.3</label>
<title>Challenges and limitations</title>
<p>Despite its strengths, the study faced challenges that are emblematic of GBM research. The dataset&#x2019;s class imbalance, particularly among long-term survival categories, limited the models&#x2019; ability to accurately predict outcomes for these underrepresented groups. Addressing this imbalance requires innovative solutions, such as synthetic data generation using techniques like GANs or oversampling methods like SMOTE. Recent studies utilizing synthetic data have shown promise in enriching underrepresented classes while preserving the underlying data distribution (<xref ref-type="bibr" rid="B19">19</xref>&#x2013;<xref ref-type="bibr" rid="B21">21</xref>).</p>
<p>Another hurdle is the interpretability of complex ML models. While algorithms like XGBoost deliver high accuracy, their &#x201c;black-box&#x201d; nature limits transparency, which can impede clinical adoption. Emerging tools such as SHAP and LIME offer potential solutions by elucidating feature contributions, enabling clinicians to trust and act upon model predictions. Incorporating these interpretability frameworks into future iterations of our models would bridge the gap between accuracy and usability.</p>
<p>Furthermore, the single-cohort nature of the dataset necessitates external validation to ensure generalizability. Multi-institutional collaborations and federated learning approaches, which allow for model training across decentralized datasets while preserving patient privacy, represent a promising avenue for addressing this limitation. Such methodologies have shown great potential in recent multi-center oncology studies.</p>
</sec>
<sec id="s6_4">
<label>6.4</label>
<title>Clinical implications and future directions</title>
<p>The findings of this study highlight the transformative potential of ML in GBM prognosis. Accurate survival predictions have profound implications for patient care, from guiding individualized treatment strategies to identifying candidates for experimental therapies and optimizing resource allocation. Models like XGBoost not only deliver precise predictions but also underscore the prognostic value of variables such as MGMT promoter methylation, age, and KPS, reinforcing their relevance in clinical decision-making.</p>
<p>Future research should explore integrating longitudinal data to enable dynamic survival predictions that evolve alongside patient trajectories. Incorporating multi-omics data, such as proteomic and epigenomic profiles, into ML pipelines could further refine prognostic accuracy. Hybrid models that balance the interpretability of simpler algorithms with the predictive power of advanced techniques like gradient boosting could offer the best of both worlds, ensuring both accuracy and clinical usability.</p>
</sec>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>RO: Data curation, Methodology, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. F-MB: Resources, Validation, Visualization, Writing &#x2013; original draft. AD: Formal Analysis, Project administration, Supervision, Writing &#x2013; original draft. CC: Methodology, Project administration, Software, Validation, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. R-AC-B: Investigation, Methodology, Software, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. MS: Conceptualization, Investigation, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. PR: Funding acquisition, Project administration, Supervision, Writing &#x2013; review &amp; editing. CT: Resources, Supervision, Visualization, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s9" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. Publication of this paper was supported by the University of Medicine and Pharmacy Carol Davila, through the institutional program Publish not Perish.</p>
</sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>More</surname> <given-names>S</given-names>
</name>
<name>
<surname>De Smet</surname> <given-names>F</given-names>
</name>
<name>
<surname>De Vleeschouwer</surname> <given-names>S</given-names>
</name>
<name>
<surname>Agostinis</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>Antioxidant network-based signatures cluster glioblastoma into distinct redox-resistant phenotypes</article-title>. <source>Front Immunol</source>. (<year>2024</year>) <volume>15</volume>:<elocation-id>1342977</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fimmu.2024.1342977</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tambi</surname> <given-names>R</given-names>
</name>
<name>
<surname>Zehra</surname> <given-names>B</given-names>
</name>
<name>
<surname>Vijayakumar</surname> <given-names>A</given-names>
</name>
<name>
<surname>Satsangi</surname> <given-names>D</given-names>
</name>
<name>
<surname>Uddin</surname> <given-names>M</given-names>
</name>
<name>
<surname>Berdiev</surname> <given-names>BK</given-names>
</name>
</person-group>. <article-title>Artificial intelligence and omics in malignant gliomas</article-title>. <source>Physiol Genomics</source>. (<year>2024</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.1152/physiolgenomics.00011.2024</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhong</surname> <given-names>S</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>JX</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>ZP</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>YD</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>CW</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>D</given-names>
</name>
<etal/>
</person-group>. <article-title>Predicting glioblastoma molecular subtypes and prognosis with a multimodal model integrating convolutional neural network, radiomics, and semantics</article-title>. <source>J Neurosurg</source>. (<year>2022</year>) <volume>139</volume>(<issue>2</issue>):<page-range>305&#x2013;14</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3171/2022.10.JNS22801</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jov&#x10d;evska</surname> <given-names>I</given-names>
</name>
</person-group>. <article-title>Next generation sequencing and machine learning technologies are painting the epigenetic portrait of glioblastoma</article-title>. <source>Front Oncol</source>. (<year>2020</year>) <volume>10</volume>:<elocation-id>798</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fonc.2020.00798</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karabacak</surname> <given-names>M</given-names>
</name>
<name>
<surname>Patil</surname> <given-names>S</given-names>
</name>
<name>
<surname>Gersey</surname> <given-names>ZC</given-names>
</name>
<name>
<surname>Komotar</surname> <given-names>RJ</given-names>
</name>
<name>
<surname>Margetis</surname> <given-names>K</given-names>
</name>
</person-group>. <article-title>Radiomics-based machine learning with natural gradient boosting for continuous survival prediction in glioblastoma</article-title>. <source>Cancers</source>. (<year>2024</year>) <volume>16</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/cancers16213614</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kwiatkowska-Miernik</surname> <given-names>A</given-names>
</name>
<name>
<surname>Wasilewski</surname> <given-names>PG</given-names>
</name>
<name>
<surname>Mruk</surname> <given-names>B</given-names>
</name>
<name>
<surname>Sklinda</surname> <given-names>K</given-names>
</name>
<name>
<surname>Bujko</surname> <given-names>M</given-names>
</name>
<name>
<surname>Walecki</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Estimating progression-free survival in patients with primary high-grade glioma using machine learning</article-title>. <source>J Clin Med</source>. (<year>2024</year>) <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jcm13206172</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>de la Fuente</surname> <given-names>MI</given-names>
</name>
<name>
<surname>Touat</surname> <given-names>M</given-names>
</name>
<name>
<surname>van den Bent</surname> <given-names>MJ</given-names>
</name>
<name>
<surname>Preusser</surname> <given-names>M</given-names>
</name>
<name>
<surname>Peters</surname> <given-names>KB</given-names>
</name>
<name>
<surname>Young</surname> <given-names>RJ</given-names>
</name>
<etal/>
</person-group>. <article-title>The role of vorasidenib in the treatment of isocitrate dehydrogenase-mutant glioma</article-title>. <source>Neuro Oncol</source>. (<year>2024</year>) <volume>25</volume>:<fpage>noae259</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/neuonc/noae259</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huma</surname> <given-names>C</given-names>
</name>
<name>
<surname>Hawon</surname> <given-names>L</given-names>
</name>
<name>
<surname>Sarisha</surname> <given-names>J</given-names>
</name>
<name>
<surname>Erdal</surname> <given-names>T</given-names>
</name>
<name>
<surname>Kevin</surname> <given-names>C</given-names>
</name>
<name>
<surname>Valentina</surname> <given-names>KA</given-names>
</name>
</person-group>. <article-title>Advances in the field of developing biomarkers for re-irradiation: a how-to guide to small, powerful data sets and artificial intelligence</article-title>. <source>Expert Rev Precis Med Drug Dev</source>. (<year>2024</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.1080/23808993.2024.2325936</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karabacak</surname> <given-names>M</given-names>
</name>
<name>
<surname>Jagtiani</surname> <given-names>P</given-names>
</name>
<name>
<surname>Di</surname> <given-names>L</given-names>
</name>
<name>
<surname>Shah</surname> <given-names>AH</given-names>
</name>
<name>
<surname>Komotar</surname> <given-names>RJ</given-names>
</name>
<name>
<surname>Margetis</surname> <given-names>K</given-names>
</name>
</person-group>. <article-title>Advancing precision prognostication in neuro-oncology: Machine learning models for data-driven personalized survival predictions in IDH-wildtype glioblastoma</article-title>. <source>Neuro-Oncol Adv</source>. (<year>2024</year>) <volume>6</volume>:<elocation-id>vdae096</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/noajnl/vdae096</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aleid</surname> <given-names>AM</given-names>
</name>
<name>
<surname>Alrasheed</surname> <given-names>AS</given-names>
</name>
<name>
<surname>Aldanyowi</surname> <given-names>SN</given-names>
</name>
<name>
<surname>Almalki</surname> <given-names>SF</given-names>
</name>
</person-group>. <article-title>Advanced magnetic resonance imaging for glioblastoma: Oncology-radiology integration</article-title>. <source>Surg Neurol Int</source>. (<year>2024</year>) <volume>15</volume>:<fpage>309</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.25259/SNI_498_2024</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duman</surname> <given-names>A</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X</given-names>
</name>
<name>
<surname>Thomas</surname> <given-names>S</given-names>
</name>
<name>
<surname>Powell</surname> <given-names>JR</given-names>
</name>
<name>
<surname>Spezi</surname> <given-names>E</given-names>
</name>
</person-group>. <article-title>Reproducible and interpretable machine learning-based radiomic analysis for overall survival prediction in glioblastoma multiforme</article-title>. <source>Cancers</source>. (<year>2024</year>) <volume>16</volume>(<issue>19</issue>):<elocation-id>3351</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/cancers16193351</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gatto</surname> <given-names>L</given-names>
</name>
<name>
<surname>Franceschi</surname> <given-names>E</given-names>
</name>
<name>
<surname>Tosoni</surname> <given-names>A</given-names>
</name>
<name>
<surname>Di Nunno</surname> <given-names>V</given-names>
</name>
<name>
<surname>Tonon</surname> <given-names>C</given-names>
</name>
<name>
<surname>Lodi</surname> <given-names>R</given-names>
</name>
<etal/>
</person-group>. <article-title>Beyond imaging and genetic signature in glioblastoma: radiogenomic holistic approach in neuro-oncology</article-title>. <source>Biomedicines</source>. (<year>2022</year>) <volume>10</volume>:<elocation-id>3205</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/biomedicines10123205</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>H</given-names>
</name>
<name>
<surname>Dohopolski</surname> <given-names>M</given-names>
</name>
<name>
<surname>Stojadinovic</surname> <given-names>S</given-names>
</name>
<name>
<surname>Schmitt</surname> <given-names>LG</given-names>
</name>
<name>
<surname>Anand</surname> <given-names>S</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>H</given-names>
</name>
<etal/>
</person-group>. <article-title>Multiomics-based outcome prediction in personalized ultra-fractionated stereotactic adaptive radiotherapy (PULSAR)</article-title>. <source>Cancers</source>. (<year>2024</year>) <volume>16</volume>(<issue>19</issue>):<elocation-id>3425</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/cancers16193425</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname> <given-names>L</given-names>
</name>
<etal/>
</person-group>. <article-title>Artificial intelligence (AI) and machine learning (ML) in precision oncology: a review on enhancing discoverability through multiomics integration</article-title>. <source>Br J Radiol</source>. (<year>2023</year>) <volume>96</volume>:<elocation-id>20230211</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1259/bjr.20230211</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khanna</surname> <given-names>VV</given-names>
</name>
<name>
<surname>Chadaga</surname> <given-names>K</given-names>
</name>
<name>
<surname>Sampathila</surname> <given-names>N</given-names>
</name>
<name>
<surname>Prabhu</surname> <given-names>S</given-names>
</name>
</person-group>. <article-title>A machine learning and explainable artificial intelligence triage-prediction system for COVID-19</article-title>. <source>Decis Anal J</source>. (<year>2023</year>) <volume>7</volume>:<elocation-id>100246</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.dajour.2023.100246</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rashmi</surname> <given-names>U</given-names>
</name>
<name>
<surname>Beena</surname> <given-names>BM</given-names>
</name>
</person-group>. <article-title>Multiple machine learning models for Alzheimer&#x2019;s disease detection for mixed data with explainable AI</article-title>. In: <conf-name>2024 15th International Conference on Computing Communication and Networking Technologies (ICCCNT)</conf-name>. <publisher-loc>Kamand, India</publisher-loc>: <publisher-name>IEEE</publisher-name>. (<year>2024</year>). p. <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCCNT61001.2024.10725637</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>D</given-names>
</name>
<name>
<surname>Gong</surname> <given-names>L</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>C</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>An explainable machine learning-based model to predict intensive care unit admission among patients with community-acquired pneumonia and connective tissue disease</article-title>. <source>Respir Res</source>. (<year>2024</year>) <volume>25</volume>:<fpage>246</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12931-024-02874-3</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Palkar</surname> <given-names>A</given-names>
</name>
<name>
<surname>Dias</surname> <given-names>CC</given-names>
</name>
<name>
<surname>Chadaga</surname> <given-names>K</given-names>
</name>
<name>
<surname>Sampathila</surname> <given-names>N</given-names>
</name>
</person-group>. <article-title>Empowering glioma prognosis with transparent machine learning and interpretative insights using explainable AI</article-title>. <source>IEEE Access</source>. (<year>2024</year>) <volume>12</volume>:<page-range>31697&#x2013;718</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2024.3370238</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Medvedieva</surname> <given-names>K</given-names>
</name>
<name>
<surname>Tosi</surname> <given-names>T</given-names>
</name>
<name>
<surname>Barbierato</surname> <given-names>E</given-names>
</name>
<name>
<surname>Gatti</surname> <given-names>A</given-names>
</name>
</person-group>. <article-title>Balancing the scale: data augmentation techniques for improved supervised learning in cyberattack detection</article-title>. <source>Eng</source>. (<year>2024</year>) <volume>5</volume>(<issue>3</issue>). doi:&#xa0;<pub-id pub-id-type="doi">10.3390/eng5030114</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pandey</surname> <given-names>A</given-names>
</name>
<name>
<surname>Shivaji</surname> <given-names>BA</given-names>
</name>
<name>
<surname>Acharya</surname> <given-names>M</given-names>
</name>
<name>
<surname>Mohbey</surname> <given-names>KK</given-names>
</name>
</person-group>. <article-title>Mitigating class imbalance in heart disease detection with machine learning</article-title>. <source>Multimed Tools Appl</source>. (<year>2024</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-024-19705-8</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hosain</surname> <given-names>T</given-names>
</name>
<name>
<surname>Jim</surname> <given-names>JR</given-names>
</name>
<name>
<surname>Mridha</surname> <given-names>MF</given-names>
</name>
<name>
<surname>Kabir</surname> <given-names>MM</given-names>
</name>
</person-group>. <article-title>Explainable AI approaches in deep learning: Advancements, applications and challenges</article-title>. <source>Comput Electr Eng</source>. (<year>2024</year>) <volume>117</volume>:<elocation-id>109246</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compeleceng.2024.109246</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>