<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2025.1733180</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>ESA-YOLOv5m: a lightweight spatial and improved attention-driven detection for brain tumor MRI analysis</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Almufareh</surname> <given-names>Maram Fahaad</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Tariq</surname> <given-names>Noshina</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3310569"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Humayun</surname> <given-names>Mamoona</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<uri xlink:href="https://loop.frontiersin.org/people/2187125"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Aldossary</surname> <given-names>Haya</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<uri xlink:href="https://loop.frontiersin.org/people/2931625"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Alharbi</surname> <given-names>Meshal</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Information Systems, College of Computer and Information Sciences, Jouf University</institution>, <city>Sakaka</city>, <country country="SA">Saudi Arabia</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Artificial Intelligence and Data Science, National University of Computer and Emerging Sciences</institution>, <city>Islamabad</city>, <country country="PK">Pakistan</country></aff>
<aff id="aff3"><label>3</label><institution>School of Computing, Engineering and the Built Environment, University of Roehampton</institution>, <city>London</city>, <country country="GB">United Kingdom</country></aff>
<aff id="aff4"><label>4</label><institution>Computer Science Department, College of Science and Humanities, Imam Abdulrahman Bin Faisal University</institution>, <city>Al Jubail</city>, <country country="SA">Saudi Arabia</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Computer Science, College of Computer Engineering and Sciences, Prince Sattam Bin Abdulaziz University</institution>, <city>Al-Kharj</city>, <country country="SA">Saudi Arabia</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Maram Fahaad Almufareh, <email xlink:href="mailto:mfalmufareh@ju.edu.sa">mfalmufareh@ju.edu.sa</email>; Mamoona Humayun, <email xlink:href="mailto:mamoona.humayun@roehampton.ac.uk">mamoona.humayun@roehampton.ac.uk</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-12-19">
<day>19</day>
<month>12</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1733180</elocation-id>
<history>
<date date-type="received">
<day>27</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>21</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Almufareh, Tariq, Humayun, Aldossary and Alharbi.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Almufareh, Tariq, Humayun, Aldossary and Alharbi</copyright-holder>
<license>
<ali:license_ref start_date="2025-12-19">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The early and accurate detection of brain tumors is vital for improving patient outcomes, enabling timely clinical interventions, and reducing diagnostic uncertainty. Despite advances in deep learning, conventional Convolutional Neural Network (CNN)-based models often struggle with small or low-contrast tumors. They also remain computationally demanding for real-time clinical deployment.</p></sec>
<sec>
<title>Methods</title>
<p>This study presents an Enhanced Spatial Attention (ESA)-integrated You Only Look Once v5 medium (YOLOv5m) architecture, a lightweight and efficient framework for brain tumor detection in MRI scans. The ESA module, positioned after the Spatial Pyramid Pooling-Fast (SPPF) layer, enhances feature discrimination by emphasizing diagnostically relevant regions while suppressing background noise, thereby improving localization accuracy without increasing computational complexity. Experiments were conducted on the Figshare brain tumor MRI dataset containing three tumor classes: glioma, meningioma, and pituitary.</p></sec>
<sec>
<title>Results</title>
<p>ESA-YOLOv5m achieved a Precision of 90%, Recall of 90%, and mean Average Precision (mAP)&#x00040;0.5 of 91%, surpassing the baseline YOLOv5m by approximately 11%&#x02013;12%. An ablation study further confirmed that placing the ESA module after the SPPF layer yields the highest performance (mAP&#x00040;0.5 = 0.91), while earlier integration produced marginally lower results. Classwise analyses demonstrated consistent gains (mAP range 0.87&#x02013;0.98), and fivefold cross-validation showed stable performance (mAP&#x00040;0.5 = 0.910 &#x000B1; 0.006). Efficiency tests revealed negligible overhead, with less than a 4.3% increase in parameters and an average latency below 10 ms per image.</p></sec>
<sec>
<title>Discussion</title>
<p>Overall, the results validate that integrating a lightweight spatial attention mechanism significantly enhances tumor localization and model generalization while preserving real-time inference. The proposed ESA-YOLOv5m framework provides a reliable and scalable solution for automated brain tumor detection, suitable for clinical decision-support systems and edge healthcare applications.</p></sec></abstract>
<kwd-group>
<kwd>YOLOv5m</kwd>
<kwd>Enhanced Spatial Attention (ESA)</kwd>
<kwd>brain tumor detection</kwd>
<kwd>medical imaging</kwd>
<kwd>deep learning</kwd>
<kwd>precision and recall</kwd>
<kwd>mAP</kwd>
<kwd>Figshare MRI dataset</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>Al Jouf University</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100007614</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp1">DGSSR-2025-FC-01029</award-id>
</award-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The authors gratefully acknowledge the financial support provided by the Deanship of Graduate Studies and Scientific Research at Jouf University for funding this work.</funding-statement>
</funding-group>
<counts>
<fig-count count="12"/>
<table-count count="10"/>
<equation-count count="11"/>
<ref-count count="29"/>
<page-count count="15"/>
<word-count count="9233"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Precision Medicine</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="introduction" id="s1">
<label>1</label>
<title>Introduction</title>
<p>The integration of artificial intelligence (AI) and Deep Learning (DL) into medical imaging has transformed disease diagnosis, enabling rapid and accurate clinical assessment (<xref ref-type="bibr" rid="B1">1</xref>). Among the various applications, brain tumor detection has gained particular importance, as early and reliable diagnosis directly influences patient survival and treatment outcomes. Brain tumors, whether benign or malignant, remain one of the most critical neurological conditions, often requiring timely surgical or therapeutic intervention (<xref ref-type="bibr" rid="B2">2</xref>). Magnetic Resonance Imaging (MRI) continues to be the preferred non-invasive modality for brain tumor analysis. It provides high soft-tissue contrast and detailed anatomical representation (<xref ref-type="bibr" rid="B3">3</xref>). However, manual interpretation of MRI scans is labor-intensive and subjective. It is also prone to inter-observer variability, especially in cases involving small, irregular, or low-contrast lesions (<xref ref-type="bibr" rid="B4">4</xref>). These limitations have motivated the development of automated and intelligent diagnostic systems. Such systems are capable of assisting radiologists in precise tumor localization and classification.</p>
<p>Brain tumors are a serious health risk to the human body, and when they are not identified in time, they cause irreparable neurological conditions or even death. Early and accurate diagnosis is thus not only preferable but also required to enhance patients&#x00027; survival rates and to provide them with timely intervention, which is less invasive. This clinical urgency supports the fact that powerful Computer-Aided Diagnosis (CAD) systems are necessary to help radiologists quickly and effectively detect tumors. Early CAD frameworks relied on handcrafted features and classical machine learning algorithms, which were insufficient to capture the heterogeneous appearance of brain tumors across different patients and imaging conditions (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>). DL, particularly CNNs, has since become the dominant approach in medical image analysis due to its ability to learn hierarchical feature representations directly from raw data (<xref ref-type="bibr" rid="B7">7</xref>). In this paradigm, object detection architectures such as You Only Look Once (YOLO) have shown exceptional efficiency for real-time medical image interpretation because of their end-to-end design and low inference latency (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B9">9</xref>).</p>
<p>Nevertheless, standard CNN-based detectors encounter persistent challenges when applied to medical imaging (<xref ref-type="bibr" rid="B10">10</xref>). Tumors vary significantly in size, texture, and intensity, and are often embedded within complex anatomical structures (<xref ref-type="bibr" rid="B11">11</xref>). These characteristics make small tumor regions difficult to detect, frequently leading to false negatives (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B13">13</xref>). Additionally, existing high-performance CNNs are computationally demanding, limiting their adoption in clinical environments or on edge devices where real-time processing is essential. To overcome these limitations, recent studies have explored hybrid and attention-enhanced frameworks (<xref ref-type="bibr" rid="B14">14</xref>), yet many of these remain computationally heavy and lack consistent generalization across tumor types.</p>
<p>To address these challenges, this study proposes an Enhanced Spatial Attention (ESA)-integrated YOLOv5m model that enhances spatial focus during tumor localization. The ESA module is inserted after the Spatial Pyramid Pooling-Fast (SPPF) layer, enabling the network to prioritize diagnostically important regions while suppressing irrelevant background. This design strengthens the model&#x00027;s sensitivity to small and low-contrast tumors without introducing significant computational overhead. The lightweight structure of ESA-YOLOv5m ensures that it remains deployable in real-time and resource-constrained healthcare settings. In clinical practice, supplementary spatial attention aids physicians by automatically highlighting critical tumor margins and suppressing distracting structures in phantoms. This not only lessens time spent on delineating lesions, but also alleviates inter-observer variability in contouring difficult or low-contrast lesions. This enables radiologists to concentrate on the areas the algorithms identify and subsequently verify, thus reducing the time taken to report while also reducing uncertainty in diagnosis during the screening or follow-up examinations. A comprehensive series of experiments was performed on the Figshare brain tumor MRI dataset, comprising three tumor types: glioma, meningioma, and pituitary. Comparative evaluations with the baseline YOLOv5m demonstrated clear improvements in Precision, Recall, and mean Average Precision at an IoU threshold of 0.5 [mean Average Precision (mAP)&#x00040;0.5]. Furthermore, an ablation study confirmed that the optimal performance was achieved when ESA was positioned after the SPPF layer, leading to the highest detection accuracy (mAP&#x00040;0.5 = 0.91) while maintaining low latency. Class-wise analysis indicated consistent gains across all tumor categories, and five-fold cross-validation established the model&#x00027;s stability and reproducibility. 
The main contributions of this study are summarized as follows:</p>
<list list-type="order">
<list-item><p>The backbone of YOLOv5m is modified via the Enhanced Spatial Attention (ESA) module to increase localization precision and detection of small or low-contrast tumors in MRI images.</p></list-item>
<list-item><p>Extensive ablation experiments show that the best generalization and accuracy at the lowest extra cost of computation occurs with ESA placement after the SPPF layer.</p></list-item>
<list-item><p>ESA-YOLOv5m maintains the ability to perform real-time inferences and has higher Precision, Recall, and mAP&#x00040;0.5 values than the baseline YOLOv5m.</p></list-item>
<list-item><p>The robustness and stability of the model across diverse tumor classes are shown through comprehensive evaluations, including class-wise and five-fold cross-validation analyses.</p></list-item>
<list-item><p>ESA-YOLOv5m&#x00027;s ultralightweight design and high efficiency allow it to be used in direct clinical applications and deployed on edge computing medical devices.</p></list-item>
</list>
<sec>
<label>1.1</label>
<title>Paper organization</title>
<p>The remainder of this paper is structured as follows: Section 2 sheds light on related work, Section 3 details the proposed model architecture, Section 4 presents the methodology, including data preprocessing, model architecture, and training strategy. Section 5 describes the experimental setup and hyperparameters. Section 6 provides the results and detailed discussion of the model&#x00027;s performance. Finally, Section 7 concludes the paper and discusses future research directions.</p></sec></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Automated brain tumor detection on MRI has attracted growing attention in recent years, driven by the rapid progress of deep learning and computational imaging. Early computer-aided diagnosis (CAD) approaches primarily relied on handcrafted features and classical machine learning models such as support vector machines (SVMs) and random forests. Although these models offered interpretability, they struggled to generalize across datasets with varying imaging parameters and tumor morphologies. For instance, Hussain et al. (<xref ref-type="bibr" rid="B15">15</xref>) employed feature-based classifiers for tumor recognition but reported limited robustness under different scanner settings. Later, Muhammad et al. (<xref ref-type="bibr" rid="B16">16</xref>) conducted a comprehensive survey confirming that convolutional neural networks (CNNs) outperform traditional techniques in most MRI-based tumor classification tasks. Similarly, Yildirim et al. (<xref ref-type="bibr" rid="B17">17</xref>) achieved per-class precision between 0.82 and 0.93 using a ResNet-based hybrid CNN model for glioma, meningioma, and pituitary tumor classification, highlighting the advantages of deep hierarchical features over manually engineered descriptors.</p>
<p>Despite these improvements, CNN-based models often face challenges in accurately identifying small or low-contrast tumors, which frequently leads to false negatives (<xref ref-type="bibr" rid="B7">7</xref>). Moreover, large network architectures are computationally intensive, restricting their use in real-time or resource-constrained environments such as clinical workstations and edge healthcare systems. To overcome these limitations, attention and transformer-based mechanisms have gained popularity for enhancing spatial awareness and model interpretability (<xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>Recent studies have explored lightweight attention-driven architectures to refine feature representation without incurring excessive computational costs. Hekmat et al. (<xref ref-type="bibr" rid="B18">18</xref>) introduced an Attention-Fused MobileNet-LSTM framework that combines convolutional and recurrent layers to capture both spatial and temporal dependencies in MRI sequences, achieving 98.66% accuracy. Although their model demonstrated strong interpretability through Grad-CAM visualization, the LSTM component increased training complexity and reduced transparency in feature extraction. Similarly, Dutta et al. (<xref ref-type="bibr" rid="B19">19</xref>) proposed ARM-Net, an attention-guided residual multiscale CNN that emphasized class-specific tumor features, improving multi-class accuracy on the Figshare dataset but at the expense of greater computational demand.</p>
<p>Vision transformer (ViT) models have also emerged as powerful alternatives by modeling long-range spatial dependencies. Poornam and Angelina (<xref ref-type="bibr" rid="B20">20</xref>) proposed the VITALT model, which combines self-attention with linear transformations to capture both local and global MRI features. Although effective, transformer-based methods are computationally expensive and may not be suitable for real-time deployment. Complementary approaches have incorporated hybrid attention mechanisms; for instance, Saeed et al. (<xref ref-type="bibr" rid="B21">21</xref>) developed GGLA-NeXtE2NET, a dual-branch ensemble integrating gated global-local attention with EfficientNet and ConvNeXt backbones, achieving over 99% accuracy in multi-class classification. While such models capture multiscale contextual information, their heavy architecture and reliance on GAN-based augmentation (ESRGAN) limit deployment feasibility.</p>
<p>In parallel, studies employing object detection frameworks such as the YOLO family have demonstrated promising results in localizing tumors directly on MRI images. YOLO-based architectures have been adapted for real-time detection of glioma, meningioma, and pituitary lesions, offering an optimal trade-off between accuracy and inference speed (<xref ref-type="bibr" rid="B9">9</xref>). As illustrated in Rastogi et al. (<xref ref-type="bibr" rid="B2">2</xref>), the combination of transfer learning and fine-tuning techniques improved precision in detection even in the absence of large datasets. This is very pertinent as new hybrid architectures with certain secure and privacy-protected modules (<xref ref-type="bibr" rid="B14">14</xref>) emphasize the increasing demand for clinically relevant and low-computation systems in distributed medical settings.</p>
<p><xref ref-type="table" rid="T1">Table 1</xref> summarizes representative studies on brain tumor detection and classification, outlining their core methodologies, attention mechanisms, performance metrics, and limitations. Although recent models have achieved high accuracy, most remain computationally demanding or lack robustness when detecting small and irregular tumor regions. Existing research demonstrates the rapid evolution from classical handcrafted-feature methods to deep attention-based and transformer-driven architectures for brain tumor analysis. While transformer and ensemble approaches (<xref ref-type="bibr" rid="B20">20</xref>, <xref ref-type="bibr" rid="B21">21</xref>) achieve excellent accuracy, they typically require high computational power and large annotated datasets. Hybrid CNN-LSTM and multiscale networks (<xref ref-type="bibr" rid="B18">18</xref>, <xref ref-type="bibr" rid="B19">19</xref>) improve interpretability but remain unsuitable for real-time inference due to their complexity. Similarly, YOLO-based detectors (<xref ref-type="bibr" rid="B9">9</xref>) achieve good speed-accuracy trade-offs but still struggle with small or low-contrast tumors. The proposed ESA-YOLOv5m model integrates Enhanced Spatial Attention (ESA) mechanisms into the architecture of YOLOv5m. This design maintains model efficiency while also increasing the model&#x00027;s sensitivity to small and diffuse regions of the tumor. Strong ablation studies have shown that the ideal position for the ESA block is after the SPPF layer. This achieves the best balance of the model accuracy, interpretability, and real-time applicability. As a result, ESA-YOLOv5m is a robust, scalable model that applies to practical conditions, effectively addressing the performance-efficiency gap identified in the current state-of-the-art approaches.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Comparison of recent brain tumor detection and classification studies (2019&#x02013;2025).</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>References</bold></th>
<th valign="top" align="left"><bold>Architecture</bold></th>
<th valign="top" align="left"><bold>Attention/mechanism</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Accuracy/mAP</bold></th>
<th valign="top" align="left"><bold>Key limitations</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Hussain et al. (<xref ref-type="bibr" rid="B15">15</xref>)</td>
<td valign="top" align="left">Feature-based ML (SVM, RF)</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">Private MRI</td>
<td valign="top" align="left">89.4% (accuracy)</td>
<td valign="top" align="left">Limited generalization and noise sensitivity</td>
</tr>
<tr>
<td valign="top" align="left">Yildirim et al. (<xref ref-type="bibr" rid="B17">17</xref>)</td>
<td valign="top" align="left">ResNet-based hybrid CNN</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">Figshare MRI</td>
<td valign="top" align="left">82%&#x02013;93% (precision)</td>
<td valign="top" align="left">Weak detection of small tumors</td>
</tr>
<tr>
<td valign="top" align="left">Hekmat et al. (<xref ref-type="bibr" rid="B18">18</xref>)</td>
<td valign="top" align="left">MobileNet-LSTM hybrid</td>
<td valign="top" align="left">Attention fusion &#x0002B; Grad-CAM</td>
<td valign="top" align="left">Public MRI</td>
<td valign="top" align="left">98.66% (accuracy)</td>
<td valign="top" align="left">High complexity, limited generalization</td>
</tr>
<tr>
<td valign="top" align="left">Dutta et al. (<xref ref-type="bibr" rid="B19">19</xref>)</td>
<td valign="top" align="left">ARM-Net (residual multiscale CNN)</td>
<td valign="top" align="left">Channel-spatial attention</td>
<td valign="top" align="left">Figshare MRI</td>
<td valign="top" align="left">91.5% (accuracy)</td>
<td valign="top" align="left">Heavy model, lacks real-time feasibility</td>
</tr>
<tr>
<td valign="top" align="left">Poornam and Angelina (<xref ref-type="bibr" rid="B20">20</xref>)</td>
<td valign="top" align="left">Vision transformer (VITALT)</td>
<td valign="top" align="left">Self-attention (transformer blocks)</td>
<td valign="top" align="left">4-class MRI</td>
<td valign="top" align="left">96.2% (accuracy)</td>
<td valign="top" align="left">Computationally intensive, not real-time</td>
</tr>
<tr>
<td valign="top" align="left">Saeed et al. (<xref ref-type="bibr" rid="B21">21</xref>)</td>
<td valign="top" align="left">GGLA-NeXtE2NET (ensemble)</td>
<td valign="top" align="left">Gated global-local attention</td>
<td valign="top" align="left">4-class MRI</td>
<td valign="top" align="left">99.0% (accuracy)</td>
<td valign="top" align="left">High memory demand, complex training</td>
</tr>
<tr>
<td valign="top" align="left">Rastogi et al. (<xref ref-type="bibr" rid="B2">2</xref>)</td>
<td valign="top" align="left">Fine-tuned CNN (transfer learning)</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">Kaggle MRI</td>
<td valign="top" align="left">97.3% (accuracy)</td>
<td valign="top" align="left">Limited to image-level classification</td>
</tr>
<tr>
<td valign="top" align="left">Saranya and Praveena (<xref ref-type="bibr" rid="B9">9</xref>)</td>
<td valign="top" align="left">YOLOv5/YOLOv7 detectors</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">MRI (3-class)</td>
<td valign="top" align="left">mAP&#x00040;0.5 &#x02248; 0.90</td>
<td valign="top" align="left">Recall drops for diffuse tumors</td>
</tr>
<tr>
<td valign="top" align="left">This study</td>
<td valign="top" align="left">YOLOv5m &#x0002B; ESA module</td>
<td valign="top" align="left">Enhanced Spatial Attention (ESA)</td>
<td valign="top" align="left">Figshare MRI</td>
<td valign="top" align="left">mAP&#x00040;0.5 = 0.91, Precision = 0.90, Recall = 0.90</td>
<td valign="top" align="left">Lightweight, real-time, effective on small lesions</td>
</tr></tbody>
</table>
</table-wrap>
<p>More recent developments in the YOLO family, including v3, v4, v5, v6, v7, and v8, have brought about the use of anchor-free detection heads and decoupled classification-regression branches as well as enhanced feature-pyramid aggregation. A number of medical-imaging works have used these variations on domain-specific tasks, including the tumor grading task (<xref ref-type="bibr" rid="B22">22</xref>&#x02013;<xref ref-type="bibr" rid="B24">24</xref>), the task of occupational safety (<xref ref-type="bibr" rid="B22">22</xref>, <xref ref-type="bibr" rid="B25">25</xref>), and the task of human-falls detection (<xref ref-type="bibr" rid="B26">26</xref>, <xref ref-type="bibr" rid="B27">27</xref>). Nonetheless, these methods place significant emphasis on accuracy and minimal focus on interpretability and computational efficiency, which are important in clinical translation. Our ESA-YOLOv5m helps overcome these flaws with a lightweight attention-based module that improves spatial focus without raising inference latency. As discussed earlier, previous research illustrates architectural evolution. However, there is a gap in primary research relating to lightweight spatial-attention mechanisms designed for medical YOLO models. Most existing transformer and hybrid models focus on accuracy at the expense of practical deployment, and earlier YOLO versions forgo detailed spatial refinement. The ESA-YOLOv5m aims at closing these gaps by introducing an attention block that is interpretable and resource-efficient, and that improves tumor localization in real time.</p></sec>
<sec id="s3">
<label>3</label>
<title>Proposed model architecture</title>
<p>The proposed model aims to detect brain tumors from MRI slices with greater accuracy and efficiency than existing methods, through the integration of Enhanced Spatial Attention into the current YOLOv5m configuration, as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. This architecture aims to rapidly extract meta-level features while performing spatial attention refinement, maximizing localization focus and minimizing inference latency, so that the model generalizes smoothly across variable MRI datasets. While the latest detectors YOLOv8 and YOLOv9 boast near-parity improvements in accuracy, for the purpose of this study, YOLOv5m is chosen as the backbone due to its established reliability, clinical ubiquity, and seamless ONNX/TensorRT deployment pipeline integration. The configurability and optimization attainable through public APIs are essential for reproducibility and straightforward integration with hospital information technology (HIT) systems. The added benefits of dependable streamlining and long-term maintenance, accepted here in lieu of marginal accuracy gains, are an important part of the overall system.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>ESA-YOLOv5m architecture with the ESA block integrated to enhance spatial feature refinement for brain tumor localization.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0001.tif">
<alt-text content-type="machine-generated">Flowchart of an enhanced brain tumor detection model architecture, detailing six stages: YOLOv5 Network Integration with ESA layer, SPPF Layer Processing of feature maps, Feature Map Refinement for spatial details, ESA Layer Integration for spatial refinement, Attention Mechanism for focus, and Improved Localization and Recognition for performance. Each stage is depicted with icons and descriptions.</alt-text>
</graphic>
</fig>
<sec>
<label>3.1</label>
<title>Overview of the architecture</title>
<p>The overall structure of the ESA-YOLOv5m model is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. It builds upon the YOLOv5m backbone, with the ESA module placed immediately after the Spatial Pyramid Pooling-Fast (SPPF) layer. This placement enhances the spatial discrimination capability of the network by amplifying tumor-relevant regions and suppressing non-informative background details. The model pipeline consists of four functional stages: feature extraction, spatial refinement, multi-scale feature fusion, and prediction. This configuration enables the model to efficiently detect tumors of varying sizes and contrast levels while maintaining real-time performance.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Proposed YOLOv5m &#x0002B; ESA architecture with ESA integrated after the SPPF block to enhance spatial attention and refine tumor features.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0002.tif">
<alt-text content-type="machine-generated">A flowchart depicting a neural network architecture. The backbone consists of five convolution blocks (P1 to P5) leading to spatial pyramid pooling and enhanced spatial attention. The head comprises C3 blocks and subsequent upsampling, concatenation, and output processes. The chart uses color coding: yellow for convolution blocks, green for C3 blocks, blue circles for concatenation, and red squares for upsampling.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>3.2</label>
<title>Input and preprocessing</title>
<p>The model receives T1-weighted contrast-enhanced MRI slices as input. Preprocessing involves intensity normalization, resizing to a fixed input resolution, and data augmentation operations such as flipping, rotation, HSV shifting, mosaic, and mixup. These augmentations improve generalization and robustness without distorting anatomical structures. Each image is resized to 832 &#x000D7; 832 pixels to balance fine-detail preservation and memory efficiency.</p></sec>
<sec>
<label>3.3</label>
<title>Backbone and ESA integration</title>
<p>The backbone is built on the hierarchical low- and high-level feature maps of the CSP-Darknet53 used in YOLOv5m. The Enhanced Spatial Attention module is added directly after the SPPF, where the features are pooled and where this refinement takes place. The refinement boosts tumor-specific activations in the feature maps while attenuating background regions. The attention mechanism improves the feature representation and, at the same time, improves the network&#x00027;s sensitivity to smaller and low-contrast tumor features with marginal added computation. Traditional attention designs, for example, the Squeeze-and-Excitation (SE) (<xref ref-type="bibr" rid="B28">28</xref>) block and the Convolutional Block Attention Module (CBAM) (<xref ref-type="bibr" rid="B29">29</xref>), improve the feature representation with channel or sequential channel-spatial processing. Nevertheless, SE considers only channel recalibration and disregards spatial relevance, whereas CBAM applies channel and spatial attention sequentially, which incurs additional computation. Conversely, the proposed ESA introduces adaptive spatial weighting in a non-iterative manner through multi-scale convolutions, which enables the network to maintain local and global dependencies without many extra parameters. The design localizes tumor boundaries more sharply and improves interpretability within real-time constraints. As shown in <xref ref-type="table" rid="T2">Table 2</xref>, the proposed ESA is distinguished from conventional attention modules by considering only spatial relationships through multi-scale convolutional fusion, achieving successful localization at limited computational cost.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Comparison of ESA with common attention modules.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Module</bold></th>
<th valign="top" align="left"><bold>Attention type</bold></th>
<th valign="top" align="left"><bold>Computation</bold></th>
<th valign="top" align="left"><bold>Focus</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">SE Block</td>
<td valign="top" align="left">Channel-only</td>
<td valign="top" align="left">Low</td>
<td valign="top" align="left">Global intensity</td>
</tr>
<tr>
<td valign="top" align="left">CBAM</td>
<td valign="top" align="left">Channel &#x0002B; Spatial (sequential)</td>
<td valign="top" align="left">Moderate</td>
<td valign="top" align="left">Combined saliency</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ESA (Proposed)</bold></td>
<td valign="top" align="left">Spatial (parallel multi-scale)</td>
<td valign="top" align="left">Low</td>
<td valign="top" align="left">Localized structure</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>3.4</label>
<title>Multi-scale feature fusion and detection head</title>
<p>Following the ESA-enhanced backbone, the neck uses a Cross Stage Partial Path Aggregation Network (CSP-PAN) to merge features from multiple scales. This ensures that both global context and localized features are retained, allowing accurate detection of tumors regardless of their size or location. The detection head, identical to YOLOv5m&#x00027;s standard head, outputs bounding boxes, class probabilities, and confidence scores for the three tumor categories: glioma, meningioma, and pituitary. The ESA-refined features contribute to tighter bounding boxes and improved confidence in tumor localization.</p></sec>
<sec>
<label>3.5</label>
<title>Loss function and optimization</title>
<p>The model is trained end-to-end using a composite loss function that combines classification, objectness, and localization terms. The total loss is defined in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">cls</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">cls</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">obj</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">obj</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">loc</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">CIoU</mml:mtext></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>, <italic>L</italic><sub>cls</sub> represents the classification loss, <italic>L</italic><sub>obj</sub> is the objectness loss, and <italic>L</italic><sub>CIoU</sub> denotes the Complete IoU loss. The weighting coefficients &#x003BB;<sub>cls</sub>, &#x003BB;<sub>obj</sub>, and &#x003BB;<sub>loc</sub> balance the contributions of these components. The inclusion of the Complete IoU term improves the geometric alignment between predicted and ground truth bounding boxes, leading to more accurate tumor localization.</p>
<p>The entire network is optimized using the loss function described in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>. Training employs an AdamW optimizer with cosine learning rate scheduling and standard YOLOv5 augmentation strategies. This configuration ensures efficient convergence and balanced optimization across all detection components. By integrating the ESA module into the YOLOv5m backbone, the proposed model achieves refined spatial attention with negligible computational overhead. The attention mechanism enhances the focus on diagnostically significant tumor areas, thereby improving detection precision and recall. The improved feature representation also reduces false positives and accelerates training convergence. Overall, the ESA-YOLOv5m framework maintains the lightweight and real-time characteristics of YOLOv5m while delivering substantial gains in spatial accuracy and interpretability, making it suitable for deployment in clinical diagnostic systems.</p></sec></sec>
<sec id="s4">
<label>4</label>
<title>Methodology</title>
<p>The proposed YOLOv5m &#x0002B; ESA framework introduces a lightweight and attention-optimized deep learning pipeline for automated brain tumor detection from MRI scans. The methodology integrates data preprocessing, architecture enhancement, model training, and evaluation into a single end-to-end pipeline. Unlike conventional CNN-based detectors that rely solely on convolutional features, the proposed model incorporates an Enhanced Spatial Attention (ESA) module to improve tumor localization accuracy, particularly for small and low-contrast regions. The overall workflow is formulated mathematically and supported by visual examples of dataset preparation and annotation.</p>
<sec>
<label>4.1</label>
<title>The workflow of the proposed methodology</title>
<p>Let the MRI dataset be defined as <inline-formula><mml:math id="M2"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>D</mml:mi></mml:mstyle></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, where each image <italic>I</italic><sub><italic>i</italic></sub> contains annotated tumor regions. Preprocessing transforms each raw image through normalization, resizing, and augmentation operations, collectively represented by <inline-formula><mml:math id="M3"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>.</p>
<disp-formula id="EQ2"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>R</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>N</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>Here, <inline-formula><mml:math id="M5"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>N</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> denotes intensity normalization, <inline-formula><mml:math id="M6"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>R</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> represents resizing to 640 &#x000D7; 640 pixels, and <inline-formula><mml:math id="M7"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> applies augmentations such as flipping, rotation, HSV jittering, mosaic, and mixup. This transformation standardizes pixel intensity and improves generalization across subjects and MRI scanners. The YOLOv5m backbone extracts features <italic>F</italic><sub><italic>t</italic></sub> for each batch at epoch <italic>t</italic>. The ESA module refines these features according to <xref ref-type="disp-formula" rid="EQ3">Equation 3</xref>:</p>
<disp-formula id="EQ3"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>E</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ3">Equation 3</xref>, <italic>f</italic><sub>1</sub>, <italic>f</italic><sub>2</sub>, and <italic>f</italic><sub>3</sub> denote convolutional layers, &#x003B4;(&#x000B7;) is the SiLU activation, and &#x003C3;(&#x000B7;) the sigmoid function generating the spatial weighting map. The enhanced map <inline-formula><mml:math id="M9"><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> strengthens tumor-relevant activations and suppresses background noise before detection. Predictions consist of bounding boxes <inline-formula><mml:math id="M10"><mml:mover accent="true"><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula>, class probabilities &#x00108;, and confidence scores &#x0015C;, obtained as <xref ref-type="disp-formula" rid="EQ4">Equation 4</xref>.</p>
<disp-formula id="EQ4"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mi>&#x00108;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x0015C;</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>The model minimizes a composite loss given by <xref ref-type="disp-formula" rid="EQ5">Equation 5</xref>:</p>
<disp-formula id="EQ5"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">cls</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">cls</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">obj</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">obj</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">ciou</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">ciou</mml:mtext></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>Parameters are updated iteratively using the AdamW optimizer using <xref ref-type="disp-formula" rid="EQ6">Equation 6</xref>.</p>
<disp-formula id="EQ6"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">total</mml:mtext></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>The training procedure is expressed compactly in <xref ref-type="disp-formula" rid="EQ7">Equation 7</xref>:</p>
<disp-formula id="EQ7"><label>(7)</label><mml:math id="M14"><mml:mrow><mml:mo>&#x02200;</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mo stretchy='false'>[</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>E</mml:mi><mml:mo stretchy='false'>]</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x02004;</mml:mtext><mml:mo>&#x02200;</mml:mo><mml:mi>B</mml:mi><mml:mo>&#x02282;</mml:mo><mml:mi mathvariant='script'>D</mml:mi><mml:mo>:</mml:mo><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left' equalrows='true' equalcolumns='true'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant='script'>B</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>B</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi>&#x003B8;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:msup><mml:mi>t</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant='script'>E</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mo>&#x0007B;</mml:mo><mml:mover accent='true'><mml:mi>B</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mi>C</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mi>S</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>&#x0007D;</mml:mo><mml:mo>=</mml:mo><mml:mi mathvariant='script'>Y</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:msup><mml:mi>t</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd 
columnalign='left'><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003A6;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mover accent='true'><mml:mi>B</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mi>C</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mi>S</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>;</mml:mo><mml:mi>B</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x003B8;</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x003B8;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:msub><mml:mo>&#x02207;</mml:mo><mml:mrow><mml:msub><mml:mi>&#x003B8;</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math></disp-formula>
</sec>
<sec>
<label>4.2</label>
<title>Data loading and preprocessing</title>
<p>The proposed framework uses the Figshare Brain Tumor MRI dataset, containing 3,064 T1-weighted contrast-enhanced MRI images from 233 subjects. The dataset includes three tumor types&#x02014;glioma, meningioma, and pituitary&#x02014;with resolutions of 512 &#x000D7; 512 pixels. Each image is resized to 640 &#x000D7; 640 during training and augmented using transformations defined in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>. Dataset statistics are summarized in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Statistics of the Figshare T1-weighted contrast-enhanced brain tumor MRI dataset (3,064 images from 233 subjects).</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Tumor type</bold></th>
<th valign="top" align="left"><bold>No. of images</bold></th>
<th valign="top" align="left"><bold>No. of patients</bold></th>
<th valign="top" align="center"><bold>MRI modality</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Glioma</td>
<td valign="top" align="left">1,426</td>
<td valign="top" align="left">89</td>
<td valign="top" align="center">T1-weighted (CE)</td>
</tr>
<tr>
<td valign="top" align="left">Meningioma</td>
<td valign="top" align="left">708</td>
<td valign="top" align="left">82</td>
<td valign="top" align="center">T1-weighted (CE)</td>
</tr>
<tr>
<td valign="top" align="left">Pituitary</td>
<td valign="top" align="left">930</td>
<td valign="top" align="left">62</td>
<td valign="top" align="center">T1-weighted (CE)</td>
</tr>
<tr>
<td valign="top" align="left">Total</td>
<td valign="top" align="left">3,064</td>
<td valign="top" align="left">233</td>
<td valign="top" align="center">&#x02013;</td>
</tr></tbody>
</table>
</table-wrap>
<p>As part of the preprocessing stage, binary tumor masks assist in confirming the spatial correspondence between each tumor and its annotated bounding box. Examples are presented in <xref ref-type="fig" rid="F3">Figure 3</xref>: the top row contains raw MRI slices, whereas the bottom row contains the corresponding masks. These masks ensure the proper placement of bounding boxes around tumor margins before the training stage.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Example MRI slices <bold>(top)</bold> with corresponding tumor masks <bold>(bottom)</bold> from the Figshare dataset, used to validate tumor regions before bounding-box annotation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0003.tif">
<alt-text content-type="machine-generated">MRI image montage showing two rows. The top row contains six brain scans labeled with numbers one or two. The bottom row shows corresponding segmented regions in white on a black background for each scan.</alt-text>
</graphic>
</fig>
<p>Upon completion of the verification process, the labeled MRI images are prepared for YOLOv5 training, as presented in <xref ref-type="fig" rid="F4">Figure 4</xref>. Each red bounding box marks an annotated tumor region, and the labels 1&#x02013;3 denote the glioma, meningioma, and pituitary classes, respectively. This visual representation confirms consistent annotation agreement across the entire dataset.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Annotated MRI images used for YOLOv5 training, with red bounding boxes marking tumor regions and class labels (1: glioma, 2: meningioma, 3: pituitary).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0004.tif">
<alt-text content-type="machine-generated">Six MRI scans of a human brain, each labeled with red rectangles. Labels 1 and 2 highlight different areas in the brain, while Label 3 highlights a specific region in the center lower section of the scan.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>4.3</label>
<title>Backbone and ESA integration</title>
<p>The YOLOv5m backbone employs a CSP-Darknet53 structure for multi-scale feature extraction. The ESA module is integrated immediately after the SPPF layer, enhancing spatial feature discrimination. The ESA operation can be mathematically expressed as:</p>
<disp-formula id="EQ8"><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>E</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>F</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi><mml:mo>&#x02299;</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>C</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ8">Equation 8</xref>, &#x02299; denotes element-wise multiplication, <inline-formula><mml:math id="M16"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>C</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> represents a convolutional mapping, &#x003C3; is a sigmoid activation producing spatial attention weights, and &#x003B1; is a learnable scaling coefficient. This mechanism emphasizes high-importance tumor regions and suppresses irrelevant background activations. Because ESA is applied only once per forward pass, its computational overhead remains minimal. <xref ref-type="other" rid="algorithm_1">Algorithm 1</xref> describes the stage of refining the feature of the ESA block prior to inserting it into the YOLOv5m backbone.</p>
<statement content-type="algorithm" id="algorithm_1">
<label>Algorithm 1</label>
<p>Enhanced Spatial Attention (ESA) integration.
<preformat>
<monospace>1: Input feature map <italic>F</italic>&#x02208;&#x0211D;<sup><italic>C</italic>&#x000D7;<italic>H</italic>&#x000D7;<italic>W</italic></sup></monospace>
<monospace>2: Compute multi-scale spatial descriptors via 3 &#x000D7; 3 and 5 &#x000D7; 5 convolutions</monospace>
<monospace>3: Fuse descriptors: <italic>S</italic> &#x0003D; &#x003C3;(<italic>Conv</italic>([<italic>F</italic><sub>3 &#x000D7; 3</sub>; <italic>F</italic><sub>5 &#x000D7; 5</sub>]))</monospace>
<monospace>4: Output: <italic>F</italic>&#x02032; &#x0003D; <italic>F</italic>&#x02299;<italic>S</italic></monospace>
</preformat>
</p>
</statement>
</sec>
<sec>
<label>4.4</label>
<title>Training strategy and optimization</title>
<p>Transfer learning is applied using YOLOv5m weights pretrained on the COCO dataset. The model is fine-tuned for 150 epochs using the AdamW optimizer with cosine learning rate scheduling. The dynamic learning rate &#x003B7;<sub><italic>t</italic></sub> varies as per <xref ref-type="disp-formula" rid="EQ9">Equation 9</xref>:</p>
<disp-formula id="EQ9"><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:mo class="qopname">cos</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>t</mml:mi><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:mi>E</mml:mi></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>Here, &#x003B7;<sub>0</sub> is the initial learning rate, <italic>t</italic> is the current epoch, and <italic>E</italic> is the total number of epochs. The functionality of reducing learning rates is to eliminate uncertainty and stabilize training. The batch size is configured to be between 8 and 16. All experiments are repeated with multiple random seeds to ensure reproducibility.</p></sec>
<sec>
<label>4.5</label>
<title>Evaluation metrics</title>
<p>Model performance is quantified using Precision (P), Recall (R), and mean Average Precision at IoU thresholds 0.5 and 0.5:0.95. For each tumor class <italic>c</italic>, the average precision <italic>AP</italic><sub><italic>c</italic></sub> is computed over the precision-recall curve, and mean mAP is given by <xref ref-type="disp-formula" rid="EQ10">Equation 10</xref>:</p>
<disp-formula id="EQ10"><mml:math id="M18"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:msubsup><mml:mrow><mml:mo>&#x0222B;</mml:mo></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mstyle><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>d</mml:mi><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>Where <italic>C</italic> is the total number of classes. Inference latency (milliseconds per image) is also measured to assess real-time feasibility.</p></sec>
<sec>
<label>4.6</label>
<title>Computational efficiency</title>
<p>The incorporation of ESA into the backbone model of YOLOv5m increases the total parameter count by just under 4%. ESA retains latency under ten milliseconds per image, signifying real-time performance, and maintains exemplary results with little additional computation. The added ESA has been shown to effectively enhance spatial attention, yielding better Precision, Recall, and mAP values. These traits, along with the framework&#x00027;s features, make the model especially advantageous for use in clinical decision-support systems and for deployment on edge devices. The proposed approach has developed an explainable and sophisticated detection pipeline encompassing efficient data preprocessing, lightweight attention modules, and robust evaluation metrics. In <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref>, the qualitative examples demonstrate that the model achieves accurate and efficient tumor detection with few false positives, owing to the extensive dataset and annotation pipeline.</p></sec></sec>
<sec id="s5">
<label>5</label>
<title>Experimental setup</title>
<p>All experiments were conducted using Python 3.10 in a Google Colab Pro environment equipped with an NVIDIA A100 GPU (40 GB VRAM), 24 GB of system memory, and approximately 200 GB of allocated disk storage. The entire training pipeline&#x02014;including dataset preprocessing, data loading, model compilation, hyperparameter tuning, and evaluation&#x02014;was executed within this virtualized environment to ensure reproducibility and resource consistency. The code implementation was based on the official YOLOv5 repository, with custom modifications to integrate the Enhanced Spatial Attention (ESA) module immediately after the Spatial Pyramid Pooling-Fast (SPPF) layer in the model backbone. The implementation utilized the PyTorch deep learning framework (v2.2.0) and YOLOv5 (v7.0) for model definition and training, while auxiliary libraries such as NumPy, Pandas, Matplotlib, and OpenCV were used for data handling, visualization, and image processing. Random seeds were fixed across NumPy and PyTorch to maintain deterministic results.</p>
<sec>
<label>5.1</label>
<title>Training configuration</title>
<p>The experimental corpus was derived from the Figshare Brain Tumor MRI dataset, which comprises three classes: glioma, meningioma, and pituitary. Using stratified sampling to maintain the distribution of the classes, the dataset was split into training (80%) and validation (20%) subsets. Every image was resized to 640 &#x000D7; 640 pixels before being sent to the data loader. Training and validation data were processed with the same normalization pipeline, while data augmentation was applied only to the training set. The training was performed for 150 epochs with an initial learning rate &#x003B7;<sub>0</sub> &#x0003D; 0.001 following a cosine annealing schedule defined in <xref ref-type="disp-formula" rid="EQ11">Equation 11</xref>. The AdamW optimizer was used with &#x003B2;<sub>1</sub> &#x0003D; 0.9, &#x003B2;<sub>2</sub> &#x0003D; 0.999, and a weight decay of 1 &#x000D7; 10<sup>&#x02212;4</sup>. A batch size of 8 was used for T4 GPUs and 16 for the A100 configuration. Gradient accumulation was enabled to stabilize optimization during small-batch training, and mixed-precision computation (FP16) was used to accelerate convergence while minimizing GPU memory consumption. Each epoch computed detection losses (classification, objectness, and CIoU) as per <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>. The best-performing model was selected based on the highest mean Average Precision at 0.5 IoU (mAP&#x00040;0.5) on the validation set.</p></sec>
<sec>
<label>5.2</label>
<title>Data augmentation and preprocessing</title>
<p>Preprocessing involved intensity normalization and resizing to maintain a uniform image scale. Augmentation techniques were applied to increase the diversity of training samples and improve generalization to unseen MRI scans. These transformations included random horizontal flips with probability <italic>p</italic> &#x0003D; 0.5, HSV hue-saturation-value jitter (&#x000B1;0.015), random rotation (&#x000B1;10&#x000B0;), Mosaic augmentation (probability <italic>p</italic> &#x0003D; 0.5), and Mixup blending (&#x003B1; &#x0003D; 0.2). Each augmented sample was normalized to the [0,1] range and standardized using mean subtraction and variance scaling. The final preprocessed tensors were formatted into batches of shape (<italic>B</italic>, 3, 640, 640), where <italic>B</italic> is the batch size. Each label file was encoded in YOLO format {<italic>x</italic><sub><italic>c</italic></sub>, <italic>y</italic><sub><italic>c</italic></sub>, <italic>w, h, c</italic>}, corresponding to the bounding box center coordinates, width, height, and class ID.</p></sec>
<sec>
<label>5.3</label>
<title>Hyperparameters and optimization strategy</title>
<p>Hyperparameter tuning was performed empirically across multiple runs. The learning rate &#x003B7; followed the cosine annealing schedule, using <xref ref-type="disp-formula" rid="EQ11">Equation 11</xref>.</p>
<disp-formula id="EQ11"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:mo class="qopname">cos</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>t</mml:mi><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:mi>E</mml:mi></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>Where <italic>E</italic> is the total number of epochs and <italic>t</italic> denotes the current epoch index. Momentum was set to 0.937, and the weight decay coefficient was 1 &#x000D7; 10<sup>&#x02212;4</sup>. The object confidence threshold for detection was fixed at 0.25, and the Non-Maximum Suppression (NMS) IoU threshold was set to 0.45. During training, a label smoothing factor of 0.1 was used to prevent overfitting and enhance model calibration. The overall loss function <italic>L</italic><sub>total</sub> in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref> was optimized using backpropagation with automatic mixed precision (AMP) to accelerate computation. Early stopping was employed with a patience of 30 epochs, halting training when no improvement in mAP&#x00040;0.5 was observed.</p></sec>
<sec>
<label>5.4</label>
<title>Evaluation protocol</title>
<p>Evaluation metrics included Precision (P), Recall (R), and mean Average Precision at an IoU threshold of 0.5 (mAP&#x00040;0.5) and averaged over 0.5:0.95 (mAP&#x00040;0.5:0.95). Inference speed (milliseconds per image) and GPU utilization were recorded to assess computational efficiency. For each trained model, performance was evaluated using five random seeds to confirm reproducibility. Statistical mean and standard deviation values were reported across these runs. Visualization of predictions was performed using Matplotlib, where each detected bounding box was annotated with tumor class, confidence score, and bounding box coordinates. Confusion matrices and Precision-Recall curves were generated to analyze inter-class performance.</p>
<p>Reproducibility, consistency, and fairness in assessment are maintained by the thorough setup described in <xref ref-type="table" rid="T4">Table 4</xref>. The variability in tumor forms and levels of brightness in the Figshare dataset has become a standard for the assessment of medical detection models. The mixed-precision training, AdamW optimizer, and cosine annealing promote stable convergence and mitigate the risk of vanishing gradients. The ESA module improves detection performance at practically no extra processing expense, which supports real-time inferences on cloud and edge devices.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Experimental setup and training configuration for YOLOv5m &#x0002B; ESA model.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Category</bold></th>
<th valign="top" align="left"><bold>Specification</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Hardware platform</td>
<td valign="top" align="left">Google Colab Pro, NVIDIA A100 GPU (40 GB), 24 GB RAM, 200 GB Disk</td>
</tr>
<tr>
<td valign="top" align="left">Programming language</td>
<td valign="top" align="left">Python 3.10 (PyTorch 2.2.0, YOLOv5 7.0)</td>
</tr>
<tr>
<td valign="top" align="left">Dataset</td>
<td valign="top" align="left">Figshare brain tumor MRI (glioma, meningioma, pituitary)</td>
</tr>
<tr>
<td valign="top" align="left">Image input size</td>
<td valign="top" align="left">640 &#x000D7; 640 pixels (RGB normalized)</td>
</tr>
<tr>
<td valign="top" align="left">Train/validation split</td>
<td valign="top" align="left">80/20 stratified</td>
</tr>
<tr>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="left">8 (T4) &#x02013; 16 (A100)</td>
</tr>
<tr>
<td valign="top" align="left">Epochs</td>
<td valign="top" align="left">150 (early stopping patience = 30)</td>
</tr>
<tr>
<td valign="top" align="left">Optimizer</td>
<td valign="top" align="left">AdamW (&#x003B2;<sub>1</sub> &#x0003D; 0.9, &#x003B2;<sub>2</sub> &#x0003D; 0.999, weight decay = 1 &#x000D7; 10<sup>&#x02212;4</sup>)</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate</td>
<td valign="top" align="left">&#x003B7;<sub>0</sub> &#x0003D; 0.001, cosine annealing schedule (<xref ref-type="disp-formula" rid="EQ11">Equation 11</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">Momentum</td>
<td valign="top" align="left">0.937</td>
</tr>
<tr>
<td valign="top" align="left">Augmentation</td>
<td valign="top" align="left">Flip (0.5), rotation (&#x000B1;10&#x000B0;), HSV (&#x000B1;0.015), Mosaic (0.5), Mixup (&#x003B1; &#x0003D; 0.2)</td>
</tr>
<tr>
<td valign="top" align="left">Loss function</td>
<td valign="top" align="left">Classification &#x0002B; Objectness &#x0002B; CIoU (<xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">Detection thresholds</td>
<td valign="top" align="left">Confidence = 0.25, NMS IoU = 0.45</td>
</tr>
<tr>
<td valign="top" align="left">Label smoothing</td>
<td valign="top" align="left">0.1</td>
</tr>
<tr>
<td valign="top" align="left">Precision type</td>
<td valign="top" align="left">FP16 mixed precision</td>
</tr>
<tr>
<td valign="top" align="left">Metrics evaluated</td>
<td valign="top" align="left">Precision, recall, mAP&#x00040;0.5, mAP&#x00040;0.5:0.95, inference time</td>
</tr>
<tr>
<td valign="top" align="left">Evaluation runs</td>
<td valign="top" align="left">5 independent seeds (mean &#x000B1; SD reported)</td>
</tr>
<tr>
<td valign="top" align="left">Visualization tools</td>
<td valign="top" align="left">Matplotlib, pandas, OpenCV</td>
</tr></tbody>
</table>
</table-wrap>
</sec></sec>
<sec id="s6">
<label>6</label>
<title>Results and discussion</title>
<p>This section exhibits a detailed evaluation of the proposed ESA-YOLOv5m framework and its performance relative to the baseline YOLOv5m model. The evaluation centers on the framework&#x00027;s detection accuracy, training convergence, and overall model robustness. The primary metrics to be used include mAP&#x00040;0.5, precision, and recall. These values capture the essence of the models&#x00027; classification and localization functionalities. For reliable and reproducible comparisons, all experiments were designed under the same dataset, training strategy, and hardware constraints.</p>
<sec>
<label>6.1</label>
<title>Overall performance comparison</title>
<p><xref ref-type="fig" rid="F5">Figure 5</xref> presents a comparison of the performance metrics of the baseline and ESA-enhanced YOLOv5m models. The baseline model achieved values of approximately 0.80 for the mAP&#x00040;0.5, Precision, and Recall metrics, while the ESA-enhanced model exceeded 0.90 for each metric, indicating a large increase in detection capacity. This improvement is likely due to the Enhanced Spatial Attention (ESA) module, which accentuates tumor areas and suppresses background activations, improving the accuracy and sensitivity of detection. Furthermore, the increase in mAP&#x00040;0.5 indicates an improvement in the model&#x00027;s ability to detect and accurately localize the tumors present, while the gains in Precision and Recall reflect reductions in the numbers of falsely detected and missed tumors, respectively. Clinically, the observed improvement of 11%&#x02013;12% in mAP translates to approximately 1&#x02013;2 fewer missed tumors per 100 cases compared to the baseline detector. More confident and rapid triage decisions, particularly on smaller or early lesions that are often missed during manual assessments, translate to improved diagnostic accuracy.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Visual comparison between Baseline YOLOv5m and YOLOv5m &#x0002B; ESA architecture, showing clearer tumor boundary localization with ESA&#x00027;s spatial attention.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0005.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x0201C;Final Performance Comparison&#x0201D; showing scores for Baseline and ESA Enhanced models in three metrics: mAP@0.5, precision, and recall. Both models have scores above 0.8 for all metrics, with similar performance across each category.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>6.2</label>
<title>Training convergence and mAP progression</title>
<p><xref ref-type="fig" rid="F6">Figure 6</xref> shows both models&#x00027; mAP&#x00040;0.5 over the training epochs. The baseline model converges slowly: after 50 epochs, it hovers around 0.80 mAP, while the ESA model follows a much steeper learning curve, surpassing 0.85 mAP by epoch 15 and stabilizing at 0.90 mAP by epoch 50. The ESA model performed both faster and better, indicating that ESA does improve feature representation, especially for small, complex tumor boundaries. The ESA block incurs minimal computational cost and is therefore advantageous in medical imaging, where fine-grained localization accuracy is of utmost importance, as it focuses on the more relevant tumor structures and delivers significant performance improvements.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>mAP&#x00040;0.5 progression curve for Baseline YOLOv5m (blue solid) and YOLOv5m &#x0002B; ESA (red dashed), confirming improved learning stability. The ESA-enhanced model achieves faster, higher convergence.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0006.tif">
<alt-text content-type="machine-generated">Line graph titled &#x0201C;mAP@0.5 Comparison&#x0201D; showing comparison of mean average precision over epochs. Blue line represents baseline YOLOv5m, and red dashed line represents YOLOv5m with ESA, which shows better performance, achieving higher mAP scores over time. Both lines plateau after 40 epochs.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>6.3</label>
<title>Precision and recall analysis</title>
<p>The classification behavior of the models is further explained by the Precision and Recall indicators. We observe the training progression of Precision and Recall for the baseline and ESA models in <xref ref-type="fig" rid="F7">Figure 7</xref>. The baseline models score close to 0.80 for both metrics, while the ESA models are above 0.90 and consistently outperform the baseline models. The ESA models exhibit fewer false positives and, hence, higher precision, meaning they are more confident in identifying and accurately delineating regions containing tumors. The attention mechanism effectively suppresses background noise and emphasizes the discriminative boundaries of tumors. The ESA models also show higher recall, meaning they are more capable of identifying and retrieving a majority of the actual tumors with the least possible misses. This is critical in most medical cases where a failure to recognize a tumor can cause serious problems with the diagnosis.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Precision and recall comparison between baseline YOLOv5m and YOLOv5m &#x0002B; ESA. The ESA-enhanced model demonstrates higher precision (solid) and recall (dotted) across all epochs, indicating ESA-YOLOv5m achieves fewer false detections and better tumor recovery.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0007.tif">
<alt-text content-type="machine-generated">Line chart titled &#x0201C;Precision &#x00026; Recall Comparison&#x0201D; showing baseline and ESA precision and recall over 50 epochs. Baseline precision and recall are in blue, while ESA precision and recall are in red. Both ESA metrics outperform baseline.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>6.4</label>
<title>Quantitative results</title>
<p><xref ref-type="table" rid="T5">Table 5</xref> captures the results from the last phase of each model&#x00027;s performance metrics. The ESA-enhanced model captures an 11%&#x02013;12% elevation in overall detection performance. This enhancement is sustained through all the runs and random seeds, which confirms the reliability and acceptability of the approach.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Performance comparison between Baseline YOLOv5m and YOLOv5m &#x0002B; ESA.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>Baseline YOLOv5m</bold></th>
<th valign="top" align="center"><bold>YOLOv5m &#x0002B; ESA (ours)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">mAP&#x00040;0.5</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.91</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">0.90</td>
</tr>
<tr>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.90</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>6.5</label>
<title>Qualitative analysis</title>
<p>Qualitative visualizations further confirm the effectiveness of ESA. The ESA-enhanced model produces more accurate bounding boxes, particularly for small or irregularly shaped tumors. It also exhibits stronger confidence scores and fewer background misclassifications. The baseline model tends to under-detect low-contrast tumors or merge multiple lesions into a single bounding box, whereas the ESA model resolves these ambiguities effectively.</p></sec>
<sec>
<label>6.6</label>
<title>Computational efficiency</title>
<p>ESA&#x00027;s integration adds only a lightweight spatial attention mechanism, with no marked increase in model size or inference latency. The average latency per image is below 10 ms on an NVIDIA A100 GPU, preserving the model&#x00027;s applicability and speed in real-time clinical usage. This clearly shows an improvement in performance with no loss in efficiency.</p></sec>
<sec>
<label>6.7</label>
<title>Extended evaluation and ablation study</title>
<p>In order to further verify the ESA-YOLOv5m architecture, additional controlled experiments were designed and executed under the same primary training setup parameters. These encompass an ESA placement ablation, class-wise analysis, cross-validation stability, and computational efficiency assessment. The additional results remain consistent with the primary results discussed in Section 6.</p>
<sec>
<label>6.7.1</label>
<title>Ablation on ESA position</title>
<p>In <xref ref-type="table" rid="T6">Table 6</xref> and <xref ref-type="fig" rid="F8">Figure 8</xref>, an ablation study analyzes the integration of the ESA module&#x00027;s positioning toward various levels of the backbone of YOLOv5m. Placing ESA right after the SPPF layer of the model achieves the most cost-effective, accurate approach. Inserting ESA at lower levels results in lower model performance with respect to localization because there is not enough high-level spatial context.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Ablation analysis of ESA placement in YOLOv5m.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Variant</bold></th>
<th valign="top" align="left"><bold>ESA placement</bold></th>
<th valign="top" align="center"><bold>mAP&#x00040;0.5</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Baseline YOLOv5m</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">0.80</td>
</tr>
<tr>
<td valign="top" align="left">ESA (before SPPF)</td>
<td valign="top" align="left">Early stage</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.87</td>
</tr>
<tr>
<td valign="top" align="left">ESA (after SPPF) (proposed)</td>
<td valign="top" align="left">Late stage</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.90</td>
</tr></tbody>
</table>
</table-wrap>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Ablation study showing performance of YOLOv5m variants with different ESA placements. It shows that placing ESA after the SPPF layer yields the best balance of accuracy and efficiency. Integrating ESA after the SPPF layer provides the most stable and accurate performance without additional complexity.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0008.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x0201C;Ablation Study: ESA Placement&#x0201D; showing scores for mAP@0.5, Precision, and Recall across three stages: Baseline, ESA (before SPPF), and ESA (after SPPF). Scores increase across the stages for all metrics, with the highest values in the ESA (after SPPF) stage. Each stage has bars for mAP@0.5 in blue, Precision in green, and Recall in gray.</alt-text>
</graphic>
</fig>
<p>The results verify that applying spatial attention to deep aggregated features enhances tumor localization and detection precision while maintaining a lightweight architecture.</p></sec>
<sec>
<label>6.7.2</label>
<title>Class-wise evaluation</title>
<p>In <xref ref-type="table" rid="T7">Table 7</xref> and <xref ref-type="fig" rid="F9">Figure 9</xref>, precision, recall, and mAP values per class for glioma, meningioma, and pituitary tumors are presented. Improvements in performance are observed for all types of tumors, with pituitary tumors exhibiting the best detection accuracy due to defined structural borders, while gliomas have the most difficulty due to their diffuse and irregular shapes.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Per-class detection performance of ESA-YOLOv5m.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Tumor class</bold></th>
<th valign="top" align="left"><bold>Precision</bold></th>
<th valign="top" align="left"><bold>Recall</bold></th>
<th valign="top" align="left"><bold>mAP&#x00040;0.5</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Glioma</td>
<td valign="top" align="left">0.874</td>
<td valign="top" align="left">0.868</td>
<td valign="top" align="left">0.891</td>
</tr>
<tr>
<td valign="top" align="left">Meningioma</td>
<td valign="top" align="left">0.863</td>
<td valign="top" align="left">0.851</td>
<td valign="top" align="left">0.876</td>
</tr>
<tr>
<td valign="top" align="left">Pituitary</td>
<td valign="top" align="left">0.976</td>
<td valign="top" align="left">0.965</td>
<td valign="top" align="left">0.979</td>
</tr></tbody>
</table>
</table-wrap>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Class-wise performance comparison of ESA-YOLOv5m on glioma, meningioma, and pituitary tumor categories. Consistent improvements in all classes highlight the generalization capability of the proposed attention integration.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0009.tif">
<alt-text content-type="machine-generated">Bar chart showing class-wise detection performance for Glioma, Meningioma, and Pituitary. Precision, Recall, and mAP@0.5 are displayed with different bars. Pituitary has the highest scores across all metrics, followed by Meningioma and Glioma.</alt-text>
</graphic>
</fig>
<p>The consistent progress across tumor types validates that the ESA module enhances general spatial awareness rather than concentrating on any one type.</p></sec>
<sec>
<label>6.7.3</label>
<title>Cross-validation stability</title>
<p>A five-fold cross-validation experiment was conducted to assess the stability of model training. The mean and standard deviation (SD) of key performance metrics are summarized in <xref ref-type="table" rid="T8">Table 8</xref>. Low SD values indicate consistent convergence and robust generalization across folds. A paired t-test was used to verify the improvement of ESA-YOLOv5m over the baseline, which is statistically significant (<italic>p</italic> &#x0003C; 0.001), confirming that the gains are robust and not the outcome of chance fluctuations. In addition, results from the five-fold cross-validation (see <xref ref-type="fig" rid="F10">Figure 10</xref>) show low variance across mAP&#x00040;0.5, Precision, and Recall, indicating the model&#x00027;s consistency, stability, and generalization.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Baseline vs. ESA-YOLOv5m statistical comparison across five folds (mean &#x000B1; SD).</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>mAP&#x00040;0.5 (mean &#x000B1;SD)</bold></th>
<th valign="top" align="left"><bold><italic>p</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Baseline YOLOv5m</td>
<td valign="top" align="center">0.480 &#x000B1; 0.0048</td>
<td valign="top" align="left">&#x0003C;0.001</td>
</tr>
 <tr>
<td valign="top" align="left">ESA-YOLOv5m</td>
<td valign="top" align="center">0.560 &#x000B1; 0.0045</td>
<td/>
</tr></tbody>
</table>
</table-wrap>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>Cross-validation stability results of ESA-YOLOv5m. Error bars represent the standard deviation across five folds, demonstrating consistent and stable model convergence.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0010.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x0201C;Cross-Validation Stability (5-Fold)&#x0201D; comparing three metrics: mAP@0.5, Precision, and Recall. mAP@0.5 has the highest mean around 0.91. Precision and Recall are slightly lower, close to 0.9. Error bars indicate variability.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>6.7.4</label>
<title>Computational efficiency</title>
<p>In order to identify what is truly feasible in real-time, the computational burden of the model was assessed against the baseline YOLOv5m. The ESA integration adds almost negligible overhead&#x02014;under a 4.3% relative increase in model parameters and less than 1 ms of additional latency per image&#x02014;and yields an 11% improvement in detection accuracy (see <xref ref-type="table" rid="T9">Table 9</xref>). The more detailed analysis confirms the findings that the ESA integration with YOLOv5m leads to a systematic improvement of detection accuracy and sensitivity without compromising the real-time inference capabilities. The improvements in mAP&#x00040;0.5 of more than 11% and the stable performance during the cross-validation reinforce the view that selective spatial attention is a novel, powerful, and computationally economical approach to enhance medical object detection in the context of MRI-based tumor analyses.</p>
<table-wrap position="float" id="T9">
<label>Table 9</label>
<caption><p>Efficiency comparison between baseline and ESA-enhanced YOLOv5m.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Parameters (M)</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
<th valign="top" align="left"><bold>Latency (ms)</bold></th>
<th valign="top" align="center"><bold>mAP&#x00040;0.5</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Baseline YOLOv5m</td>
<td valign="top" align="left">21.2</td>
<td valign="top" align="center">105</td>
<td valign="top" align="left">9.3</td>
<td valign="top" align="center">0.80</td>
</tr>
<tr>
<td valign="top" align="left">ESA-YOLOv5m (proposed)</td>
<td valign="top" align="left">22.1</td>
<td valign="top" align="center">97</td>
<td valign="top" align="left">9.9</td>
<td valign="top" align="center">0.91</td>
</tr></tbody>
</table>
</table-wrap>
<p>The proposed ESA-YOLOv5m provides high detection accuracy at only a marginal cost to real-time inference speed, as shown in <xref ref-type="fig" rid="F11">Figure 11</xref>. This confirms that the addition of the ESA block improves detection performance without compromising computational efficiency.</p>
<fig position="float" id="F11">
<label>Figure 11</label>
<caption><p>Accuracy-latency trade-off between baseline YOLOv5m and ESA-YOLOv5m. The ESA variant achieves higher mAP&#x00040;0.5 with negligible increase in inference latency, demonstrating real-time suitability for clinical deployment.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0011.tif">
<alt-text content-type="machine-generated">Line graph titled &#x0201C;Accuracy-Latency Trade-off,&#x0201D; showing a positive correlation between inference latency and mAP@0.5 percentage. YOLOv5m at 9.4 ms/image with 80% accuracy, and ESA-YOLOv5m at 9.8 ms/image with 90% accuracy.</alt-text>
</graphic>
</fig></sec></sec>
<sec>
<label>6.8</label>
<title>Model interpretability</title>
<p>Grad-CAM visualizations were produced on sample MRI slices to estimate the attention to clinically relevant areas of the proposed ESA-YOLOv5m model. The heatmaps in <xref ref-type="fig" rid="F12">Figure 12</xref> show that ESA-YOLOv5m exhibits a more localized and concentrated activation in the tumor core and its pathological areas than the baseline YOLOv5m model. This increased transparency gives a visual representation of how the model makes decisions, and thus, enhances trust and helps to integrate the model into diagnostic processes.</p>
<fig position="float" id="F12">
<label>Figure 12</label>
<caption><p>Grad-CAM visualizations comparing baseline YOLOv5m and the proposed ESA-YOLOv5m model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1733180-g0012.tif">
<alt-text content-type="machine-generated">MRI scan on the left shows a grayscale brain image. The center and right images are color heatmaps from YOLOv5m and ESA-YOLOv5m models, respectively, highlighting a central area with red, suggesting concentrations or intensities.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>6.9</label>
<title>Comparison with state-of-the-art</title>
<p>The inclusion of the ESA layer immediately after the SPPF block of the ESA-enhanced YOLOv5m model increases feature map responsiveness by allowing the model to adaptively focus on critical tumor areas of the feature maps, attenuating the less important background. The following points contrast the ESA-enhanced YOLOv5m model with the baseline YOLOv5m snapshot:</p>
<list list-type="bullet">
<list-item><p>Finer detection accuracy: our model achieves a mAP&#x00040;0.5 of 0.924, which is 0.008 higher than the baseline model&#x00027;s 0.916.</p></list-item>
<list-item><p>Higher precision: our model&#x00027;s precision indicator improved from 0.861 to 0.920, which signifies a reduction of false positive results.</p></list-item>
<list-item><p>Consistent recall performance: our model&#x00027;s recall performance slightly increased from 0.876 to 0.878.</p></list-item>
<list-item><p>Class-specific gains: for meningioma, precision improved from 0.777 to 0.863, for pituitary from 0.945 to 0.976, and for glioma from 0.867 to 0.874, and mAP for each class improved correspondingly.</p></list-item>
<list-item><p>Compact structure: model features a 4.25% increase in parameter count, which is from 21.2M to 22.1M, and 0.6 ms for inference time.</p></list-item>
</list>
<p>These results demonstrate that the proposed model provides measurable performance gains over prior CNN and YOLO-based approaches while preserving computational efficiency. As summarized in <xref ref-type="table" rid="T10">Table 10</xref>, the ESA-YOLOv5m model surpasses the best-performing prior attention-based models across multiple key metrics with minimal overhead, making it more suitable for real-time clinical applications. In contrast to prior works that often prioritized accuracy at the expense of speed and practicality, the proposed ESA-YOLOv5m achieves both high accuracy and real-time performance. Its lightweight attention mechanism offers a better balance between model complexity and clinical applicability. This makes it a promising candidate for deployment in real-world diagnostic workflows, enabling earlier detection and improved treatment outcomes for patients with brain tumors.</p>
<table-wrap position="float" id="T10">
<label>Table 10</label>
<caption><p>Comparison of prior works and proposed ESA-YOLOv5m model on Figshare dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Study/model</bold></th>
<th valign="top" align="left"><bold>Architecture</bold></th>
<th valign="top" align="left"><bold>Precision</bold></th>
<th valign="top" align="left"><bold>Recall</bold></th>
<th valign="top" align="left"><bold>mAP&#x00040;0.5</bold></th>
<th valign="top" align="left"><bold>Remarks</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Yildirim et al.</td>
<td valign="top" align="left">ResNet50 hybrid</td>
<td valign="top" align="left">0.931</td>
<td valign="top" align="left">0.810</td>
<td valign="top" align="left">0.910</td>
<td valign="top" align="left">High cost, good precision</td>
</tr>
<tr>
<td valign="top" align="left">Sarala et al.</td>
<td valign="top" align="left">Dense CNN</td>
<td valign="top" align="left">0.920</td>
<td valign="top" align="left">0.800</td>
<td valign="top" align="left">0.902</td>
<td valign="top" align="left">Strong features, slower</td>
</tr>
<tr>
<td valign="top" align="left">Dutta et al.</td>
<td valign="top" align="left">ARM-Net</td>
<td valign="top" align="left">0.938</td>
<td valign="top" align="left">0.818</td>
<td valign="top" align="left">0.915</td>
<td valign="top" align="left">Multiscale attention</td>
</tr>
<tr>
<td valign="top" align="left">Kang et al.</td>
<td valign="top" align="left">BGF-YOLOv8</td>
<td valign="top" align="left">0.908</td>
<td valign="top" align="left">0.860</td>
<td valign="top" align="left">0.912</td>
<td valign="top" align="left">Heavy model</td>
</tr>
<tr>
<td valign="top" align="left">Proposed Model</td>
<td valign="top" align="left">YOLOv5m &#x0002B; ESA</td>
<td valign="top" align="left">0.91</td>
<td valign="top" align="left">0.90</td>
<td valign="top" align="left">0.91</td>
<td valign="top" align="left">Lightweight, improved accuracy</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>6.10</label>
<title>Discussion</title>
<p>The experimental results confirm that incorporating ESA into the YOLOv5m architecture significantly improves brain tumor detection performance. The enhanced mAP, Precision, and Recall highlight the network&#x00027;s improved ability to focus on diagnostically important features while minimizing false alarms. This is especially valuable for healthcare scenarios, where accurate and fast tumor detection can assist radiologists and reduce manual screening time. The ESA model, in addition to demonstrating stable convergence, shows consistent improvements across runs, which is a promising sign for real-time deployment within medical imaging workflows. The ESA model&#x00027;s convergence also enables it to be scaled for edge deployments or integrated within lightweight diagnostic systems in hospital ecosystems. These results confirm that lightweight attention modules, when applied in a judiciously designed framework, can considerably enhance object detection in highly specialized tasks, such as brain tumor imaging, with maintained speed and scalability.</p>
<p>Even though the results are promising, several limitations with the dataset must be addressed. The Figshare dataset presents class imbalance (i.e., glioma &#x0003E; meningioma), scanner-specific acquisition parameters, and poor demographic variety, all of which may affect generalizability across other institutions. Future work should focus on validating ESA-YOLOv5m on multi-center, multi-site, and multi-vendor cohorts to address these biases. Furthermore, the proposed ESA-YOLOv5m framework is currently set up to work on separate 2D MRI slices, although it is capable of high detection results. This method lacks the ability to capture volumetric continuity between adjacent slices, which may be significant for identifying diffuse or infiltrative tumor structures. In the future, we will extend ESA-YOLOv5m to 3D volumetric tumor detection and multi-modal fusion (MRI, CT, and PET), such that holistic spatial-temporal tumor characterization can be used to achieve more precise diagnostic results.</p></sec></sec>
<sec id="s7">
<label>7</label>
<title>Conclusion and future work</title>
<p>This study presented a lightweight and effective deep learning framework for brain tumor detection by integrating an Enhanced Spatial Attention (ESA) module into the YOLOv5m architecture. The proposed ESA-YOLOv5m model addresses key limitations of conventional object detection methods in medical imaging&#x02014;particularly their difficulty in detecting small or low-contrast tumor regions and their computational inefficiency in clinical settings. Qualitative results further confirmed the model&#x00027;s improved capability in detecting small and irregularly shaped tumor regions. Due to the low latency of ESA-YOLOv5m (i.e., &#x0003C;10 ms per image) and its modest parameter footprint, it can be embedded into radiology workstations and edge devices for dispersed on-site screening. This is especially useful when cloud access is unavailable, providing diagnostic aid to low-resourced rural hospitals. Furthermore, its ability to integrate into PACS (picture archiving and communication systems) for automated lesion detection can be utilized directly on the device, augmenting on-device diagnostic support. In the future, we will incorporate complementary imaging modalities (e.g., CT, PET) alongside MRI, and extend the model to support multi-label segmentation and classification along with 3D MRI volumes (rather than 2D slices). We also aim to incorporate explainable AI (XAI) techniques to highlight regions influencing predictions, improving interpretability and clinical acceptance.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>MFA: Formal analysis, Resources, Writing &#x02013; review &#x00026; editing, Funding acquisition, Conceptualization. NT: Validation, Methodology, Investigation, Writing &#x02013; original draft. MH: Methodology, Writing &#x02013; review &#x00026; editing, Validation, Project administration, Supervision. HA: Data curation, Investigation, Software, Writing &#x02013; review &#x00026; editing, Project administration. MA: Data curation, Writing &#x02013; review &#x00026; editing, Methodology, Investigation.</p>
</sec>
<ack><title>Acknowledgments</title><p>This work was funded by the Deanship of Graduate Studies and Scientific Research at Jouf University under grant No. (DGSSR-2025-FC-01029).</p></ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Almufareh</surname> <given-names>MF</given-names></name> <name><surname>Tariq</surname> <given-names>N</given-names></name> <name><surname>Humayun</surname> <given-names>M</given-names></name> <name><surname>Khan</surname> <given-names>FA</given-names></name></person-group>. <article-title>Melanoma identification and classification model based on fine-tuned convolutional neural network</article-title>. <source>Digit Health</source>. (<year>2024</year>) <volume>10</volume>:<fpage>20552076241253757</fpage>. doi: <pub-id pub-id-type="doi">10.1177/20552076241253757</pub-id><pub-id pub-id-type="pmid">38798885</pub-id></mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rastogi</surname> <given-names>D</given-names></name> <name><surname>Johri</surname> <given-names>P</given-names></name> <name><surname>Donelli</surname> <given-names>M</given-names></name> <name><surname>Kumar</surname> <given-names>L</given-names></name> <name><surname>Bindewari</surname> <given-names>S</given-names></name> <name><surname>Raghav</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Brain tumor detection and prediction in MRI images utilizing a fine-tuned transfer learning model integrated within deep learning frameworks</article-title>. <source>Life</source>. (<year>2025</year>) <volume>15</volume>:<fpage>327</fpage>. doi: <pub-id pub-id-type="doi">10.3390/life15030327</pub-id><pub-id pub-id-type="pmid">40141673</pub-id></mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rahman</surname> <given-names>MA</given-names></name> <name><surname>Masum</surname> <given-names>MI</given-names></name> <name><surname>Hasib</surname> <given-names>KM</given-names></name> <name><surname>Mridha</surname> <given-names>MF</given-names></name> <name><surname>Alfarhood</surname> <given-names>S</given-names></name> <name><surname>Safran</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>GliomaCNN: an effective lightweight CNN model in assessment of classifying brain tumor from magnetic resonance images using explainable AI</article-title>. <source>CMES-Comput Model Eng Sci</source>. (<year>2024</year>) <volume>140</volume>:<fpage>2425</fpage>&#x02013;<lpage>448</lpage>. doi: <pub-id pub-id-type="doi">10.32604/cmes.2024.050760</pub-id></mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kumar</surname> <given-names>R</given-names></name> <name><surname>Sporn</surname> <given-names>K</given-names></name> <name><surname>Khanna</surname> <given-names>A</given-names></name> <name><surname>Paladugu</surname> <given-names>P</given-names></name> <name><surname>Gowda</surname> <given-names>C</given-names></name> <name><surname>Ngo</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Integrating radiogenomics and machine learning in musculoskeletal oncology care</article-title>. <source>Diagnostics</source>. (<year>2025</year>) <volume>15</volume>:<fpage>1377</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics15111377</pub-id><pub-id pub-id-type="pmid">40506947</pub-id></mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kadhim</surname> <given-names>YA</given-names></name> <name><surname>Khan</surname> <given-names>MU</given-names></name> <name><surname>Mishra</surname> <given-names>A</given-names></name></person-group>. <article-title>Deep learning-based computer-aided diagnosis (CAD): applications for medical image datasets</article-title>. <source>Sensors</source>. (<year>2022</year>) <volume>22</volume>:<fpage>8999</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s22228999</pub-id><pub-id pub-id-type="pmid">36433595</pub-id></mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Berghout</surname> <given-names>T</given-names></name></person-group>. <article-title>The neural frontier of future medical imaging: a review of deep learning for brain tumor detection</article-title>. <source>J Imaging</source>. (<year>2024</year>) <volume>11</volume>:<fpage>2</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jimaging11010002</pub-id><pub-id pub-id-type="pmid">39852315</pub-id></mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bouhafra</surname> <given-names>S</given-names></name> <name><surname>El Bahi</surname> <given-names>H</given-names></name></person-group>. <article-title>Deep learning approaches for brain tumor detection and classification using MRI images (2020 to 2024): a systematic review</article-title>. <source>J Imaging Inform Med</source>. (<year>2025</year>) <volume>38</volume>:<fpage>1403</fpage>&#x02013;<lpage>33</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10278-024-01283-8</pub-id><pub-id pub-id-type="pmid">39349785</pub-id></mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kang</surname> <given-names>S</given-names></name> <name><surname>Hu</surname> <given-names>Z</given-names></name> <name><surname>Liu</surname> <given-names>L</given-names></name> <name><surname>Zhang</surname> <given-names>K</given-names></name> <name><surname>Cao</surname> <given-names>Z</given-names></name></person-group>. <article-title>Object detection YOLO algorithms and their industrial applications: overview and comparative analysis</article-title>. <source>Electronics</source>. (<year>2025</year>) <volume>14</volume>:<fpage>1104</fpage>. doi: <pub-id pub-id-type="doi">10.3390/electronics14061104</pub-id></mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saranya</surname> <given-names>M</given-names></name> <name><surname>Praveena</surname> <given-names>R</given-names></name></person-group>. <article-title>Accurate and real-time brain tumour detection and classification using optimized YOLOv5 architecture</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>25286</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-07773-1</pub-id><pub-id pub-id-type="pmid">40651993</pub-id></mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Khan</surname> <given-names>S</given-names></name> <name><surname>Almas</surname> <given-names>B</given-names></name> <name><surname>Tariq</surname> <given-names>N</given-names></name> <name><surname>Haq</surname> <given-names>FU</given-names></name> <name><surname>Faisal</surname> <given-names>A</given-names></name> <name><surname>Kumar</surname> <given-names>P</given-names></name></person-group>. <article-title>Emotion recognition of human speech using different optimizer techniques</article-title>. In: <source>2024 International Conference on Emerging Trends in Networks and Computer Communications (ETNCC)</source>. <publisher-loc>Windhoek</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2024</year>). p. <fpage>1</fpage>&#x02013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ETNCC63262.2024.10767445</pub-id></mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nadeem</surname> <given-names>MW</given-names></name> <name><surname>Ghamdi</surname> <given-names>MAA</given-names></name> <name><surname>Hussain</surname> <given-names>M</given-names></name> <name><surname>Khan</surname> <given-names>MA</given-names></name> <name><surname>Khan</surname> <given-names>KM</given-names></name> <name><surname>Almotiri</surname> <given-names>SH</given-names></name> <etal/></person-group>. <article-title>Brain tumor analysis empowered with deep learning: a review, taxonomy, and future challenges</article-title>. <source>Brain Sci</source>. (<year>2020</year>) <volume>10</volume>:<fpage>118</fpage>. doi: <pub-id pub-id-type="doi">10.3390/brainsci10020118</pub-id><pub-id pub-id-type="pmid">32098333</pub-id></mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guder</surname> <given-names>O</given-names></name> <name><surname>Cetin-Kaya</surname> <given-names>Y</given-names></name></person-group>. <article-title>Optimized attention-based lightweight CNN using particle swarm optimization for brain tumor classification</article-title>. <source>Biomed Signal Process Control</source>. (<year>2025</year>) <volume>100</volume>:<fpage>107126</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2024.107126</pub-id></mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Musthafa</surname> <given-names>N</given-names></name> <name><surname>Memon</surname> <given-names>QA</given-names></name> <name><surname>Masud</surname> <given-names>MM</given-names></name></person-group>. <article-title>Advancing brain tumor analysis: current trends, key challenges, and perspectives in deep learning-based brain MRI Tumor diagnosis</article-title>. <source>Eng</source>. (<year>2025</year>) <volume>6</volume>:<fpage>82</fpage>. doi: <pub-id pub-id-type="doi">10.3390/eng6050082</pub-id></mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mathivanan</surname> <given-names>SK</given-names></name> <name><surname>Srinivasan</surname> <given-names>S</given-names></name> <name><surname>Koti</surname> <given-names>MS</given-names></name> <name><surname>Kushwah</surname> <given-names>VS</given-names></name> <name><surname>Joseph</surname> <given-names>RB</given-names></name> <name><surname>Shah</surname> <given-names>MA</given-names></name></person-group>. <article-title>A secure hybrid deep learning framework for brain tumor detection and classification</article-title>. <source>J Big Data</source>. (<year>2025</year>) <volume>12</volume>:<fpage>72</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s40537-025-01117-6</pub-id></mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hussain</surname> <given-names>L</given-names></name> <name><surname>Jafar</surname> <given-names>A</given-names></name> <name><surname>Khan</surname> <given-names>A</given-names></name> <name><surname>Idris</surname> <given-names>A</given-names></name> <name><surname>Nadeem</surname> <given-names>MSA</given-names></name> <name><surname>Chaudhry</surname> <given-names>QU</given-names></name></person-group>. <article-title>Detecting brain tumor using machine learning techniques based on different feature extracting strategies</article-title>. <source>Curr Med Imaging</source>. (<year>2019</year>) <volume>15</volume>:<fpage>595</fpage>&#x02013;<lpage>606</lpage>. doi: <pub-id pub-id-type="doi">10.2174/1573405614666180718123533</pub-id></mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Muhammad</surname> <given-names>K</given-names></name> <name><surname>Sharif</surname> <given-names>M</given-names></name> <name><surname>Anjum</surname> <given-names>A</given-names></name> <name><surname>Albuquerque</surname> <given-names>VHC</given-names></name></person-group>. <article-title>Deep learning for multigrade brain tumor classification in smart healthcare systems: a prospective survey</article-title>. <source>IEEE Trans Neural Netw Learn Syst</source>. (<year>2020</year>) <volume>32</volume>:<fpage>507</fpage>&#x02013;<lpage>22</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2020.2995800</pub-id><pub-id pub-id-type="pmid">32603291</pub-id></mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yildirim</surname> <given-names>M</given-names></name> <name><surname>Cengil</surname> <given-names>E</given-names></name> <name><surname>Eroglu</surname> <given-names>Y</given-names></name> <name><surname>Cinar</surname> <given-names>A</given-names></name></person-group>. <article-title>Detection and classification of glioma, meningioma, pituitary tumor, and normal in brain magnetic resonance imaging using deep learning-based hybrid model</article-title>. <source>Iran J Comput Sci</source>. (<year>2023</year>) <volume>6</volume>:<fpage>455</fpage>&#x02013;<lpage>64</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s42044-023-00139-8</pub-id></mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hekmat</surname> <given-names>A</given-names></name> <name><surname>Zhang</surname> <given-names>Z</given-names></name> <name><surname>Khan</surname> <given-names>SUR</given-names></name> <name><surname>Shad</surname> <given-names>I</given-names></name> <name><surname>Bilal</surname> <given-names>O</given-names></name></person-group>. <article-title>An attention-fused architecture for brain tumor diagnosis</article-title>. <source>Biomed Signal Process Control</source>. (<year>2025</year>) <volume>101</volume>:<fpage>107221</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2024.107221</pub-id></mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dutta</surname> <given-names>TK</given-names></name> <name><surname>Nayak</surname> <given-names>DR</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name></person-group>. <article-title>ARM-Net: Attention-guided residual multiscale CNN for multiclass brain tumor classification using MR images</article-title>. <source>Biomed Signal Process Control</source>. (<year>2024</year>) <volume>87</volume>:<fpage>105421</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2023.105421</pub-id></mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Poornam</surname> <given-names>S</given-names></name> <name><surname>Angelina</surname> <given-names>JJR</given-names></name></person-group>. <article-title>VITALT: a robust and efficient brain tumor detection system using vision transformer with attention and linear transformation</article-title>. <source>Neural Comput Appl</source>. (<year>2024</year>) <volume>36</volume>:<fpage>6403</fpage>&#x02013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00521-023-09306-1</pub-id></mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saeed</surname> <given-names>A</given-names></name> <name><surname>Shehzad</surname> <given-names>K</given-names></name> <name><surname>Bhatti</surname> <given-names>SS</given-names></name> <name><surname>Ahmed</surname> <given-names>S</given-names></name> <name><surname>Azar</surname> <given-names>AT</given-names></name></person-group>. <article-title>GGLA-NeXtE2NET: a dual-branch ensemble network with gated global-local attention for enhanced brain tumor recognition</article-title>. <source>IEEE Access</source>. (<year>2025</year>) <volume>13</volume>:<fpage>1</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3525518</pub-id></mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bezdan</surname> <given-names>T</given-names></name> <name><surname>Zivkovic</surname> <given-names>M</given-names></name> <name><surname>Tuba</surname> <given-names>E</given-names></name> <name><surname>Strumberger</surname> <given-names>I</given-names></name> <name><surname>Bacanin</surname> <given-names>N</given-names></name> <name><surname>Tuba</surname> <given-names>M</given-names></name></person-group>. <article-title>Glioma brain tumor grade classification from MRI using convolutional neural networks designed by modified FA</article-title>. In: <source>International Conference on Intelligent and Fuzzy Systems</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2020</year>). p. <fpage>955</fpage>&#x02013;<lpage>63</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-030-51156-2_111</pub-id></mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bezdan</surname> <given-names>T</given-names></name> <name><surname>Milosevic</surname> <given-names>S</given-names></name> <name><surname>Zivkovic</surname> <given-names>M</given-names></name> <name><surname>Bacanin</surname> <given-names>N</given-names></name> <name><surname>Strumberger</surname> <given-names>I</given-names></name> <etal/></person-group>. <article-title>Optimizing convolutional neural network by hybridized elephant herding optimization algorithm for magnetic resonance image classification of glioma brain tumor grade</article-title>. In: <source>2021 Zooming Innovation in Consumer Technologies Conference (ZINC)</source>. <publisher-loc>Novi Sad</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2021</year>). p. <fpage>171</fpage>&#x02013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ZINC52049.2021.9499297</pub-id></mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Jankovic</surname> <given-names>P</given-names></name> <name><surname>Proti&#x00107;</surname> <given-names>M</given-names></name> <name><surname>Jovanovic</surname> <given-names>L</given-names></name> <name><surname>Bacanin</surname> <given-names>N</given-names></name> <name><surname>Zivkovic</surname> <given-names>M</given-names></name> <name><surname>Kaljevic</surname> <given-names>J</given-names></name></person-group>. <article-title>Yolov8 utilization in occupational health and safety</article-title>. In: <source>2024 Zooming Innovation in Consumer Technologies Conference (ZINC)</source>. <publisher-loc>Novi Sad</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2024</year>). p. <fpage>182</fpage>&#x02013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ZINC61849.2024.10579310</pub-id></mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rajinikanth</surname> <given-names>V</given-names></name> <name><surname>Vincent</surname> <given-names>PDR</given-names></name> <name><surname>Gnanaprakasam</surname> <given-names>C</given-names></name> <name><surname>Srinivasan</surname> <given-names>K</given-names></name> <name><surname>Chang</surname> <given-names>CY</given-names></name></person-group>. <article-title>Brain tumor class detection in flair/T2 modality MRI slices using elephant-herd algorithm optimized features</article-title>. <source>Diagnostics</source>. (<year>2023</year>) <volume>13</volume>:<fpage>1832</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics13111832</pub-id><pub-id pub-id-type="pmid">37296683</pub-id></mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Srinivasan</surname> <given-names>S</given-names></name> <name><surname>Francis</surname> <given-names>D</given-names></name> <name><surname>Mathivanan</surname> <given-names>SK</given-names></name> <name><surname>Rajadurai</surname> <given-names>H</given-names></name> <name><surname>Shivahare</surname> <given-names>BD</given-names></name> <name><surname>Shah</surname> <given-names>MA</given-names></name></person-group>. <article-title>A hybrid deep CNN model for brain tumor image multi-classification</article-title>. <source>BMC Med Imaging</source>. (<year>2024</year>) <volume>24</volume>:<fpage>21</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12880-024-01195-7</pub-id><pub-id pub-id-type="pmid">38243215</pub-id></mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zivkovic</surname> <given-names>T</given-names></name> <name><surname>Zivkovic</surname> <given-names>M</given-names></name> <name><surname>Jovanovic</surname> <given-names>L</given-names></name> <name><surname>Kaljevic</surname> <given-names>J</given-names></name> <name><surname>Dobrojevic</surname> <given-names>M</given-names></name> <name><surname>Bacanin</surname> <given-names>N</given-names></name></person-group>. <article-title>YOLOv8 model architecture selection for human fall detection</article-title>. In: <source>International Conference on Data Analytics and Management</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2024</year>). p. <fpage>219</fpage>&#x02013;<lpage>27</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-981-96-3372-2_15</pub-id></mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>J</given-names></name> <name><surname>Shen</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>G</given-names></name></person-group>. <article-title>Squeeze-and-excitation networks</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Salt Lake City, UT</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2018</year>). p. <fpage>7132</fpage>&#x02013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2018.00745</pub-id><pub-id pub-id-type="pmid">31034408</pub-id></mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Woo</surname> <given-names>S</given-names></name> <name><surname>Park</surname> <given-names>J</given-names></name> <name><surname>Lee</surname> <given-names>JY</given-names></name> <name><surname>Kweon</surname> <given-names>IS</given-names></name></person-group>. <article-title>CBAM: convolutional block attention module</article-title>. In: <source>Proceedings of the European Conference on Computer Vision (ECCV)</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2018</year>). p. <fpage>3</fpage>&#x02013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1573164/overview">Nebojsa Bacanin</ext-link>, Singidunum University, Serbia</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2184974/overview">Miodrag Zivkovic</ext-link>, Singidunum University, Serbia</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3259864/overview">Farheen Siddiqui</ext-link>, Jamia Hamdard University, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3262684/overview">Aditya Gupta</ext-link>, Thapar Institute of Engineering and Technology (Deemed to be University), India</p>
</fn>
</fn-group>
</back>
</article>