<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1732616</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Transformer enhanced based YOLOv8 integration: a hybrid deep learning framework for intelligent insulator defect detection in high-voltage transmission systems</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Farooq</surname> <given-names>Umer</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3224526"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yang</surname> <given-names>Fan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Shaikh</surname> <given-names>Jamshed Ali</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/2796105"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>State Key Laboratory of Power Transmission Equipment, Systems Security and New Technology, School of Electrical Engineering, Chongqing University</institution>, <city>Chongqing</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>School of Computer Science and Engineering, South China University of Technology</institution>, <city>Guangzhou</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Fan Yang, <email xlink:href="mailto:yangfan@cqu.edu.cn">yangfan@cqu.edu.cn</email>; Jamshed Ali Shaikh, <email xlink:href="mailto:jimmy.shaikh@hotmail.com">jimmy.shaikh@hotmail.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-02">
<day>02</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1732616</elocation-id>
<history>
<date date-type="received">
<day>26</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>29</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Farooq, Yang and Shaikh.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Farooq, Yang and Shaikh</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Insulators are vital components of high-voltage power transmission systems, where undetected defects can lead to catastrophic failures and significant economic losses. Accurate and timely detection of insulator defects (IDs) under diverse environmental conditions is critical for ensuring system reliability. This study presents Transformer-Enhanced YOLOv8 (TE-YOLOv8), a novel hybrid deep learning framework designed to address the challenges of detecting small, complex defects in transmission line inspections. TE-YOLOv8 integrates transformer-based attention mechanisms with the advanced YOLOv8 architecture, introducing several key innovations that enhance its performance. Specifically, it incorporates Global Convolution (GConv) modules to capture extended spatial context for improved feature extraction, C3f-Global Pooling Fusion (C3f-GPF) modules to amplify discriminative features, and Multiscale Information Fusion (MSIF) modules with learnable weights for adaptive multi-scale detection. Additionally, it utilizes Weighted Feature Information Fusion (WFIF) modules for channel-wise attention to refine feature representation, and a Transformer-enhanced neck architecture to model global dependencies and provide enhanced contextual understanding. To improve localization precision and accelerate convergence, the framework adopts the SCYLLA-IoU (SIoU) loss function. Extensive experimental validation on the IDID and CPLID datasets demonstrates that TE-YOLOv8 achieves mean average precision (mAP) scores of 94.2% and 93.8%, respectively, representing improvements of 4.9% and 5.1% over the baseline YOLOv8, and 1.9% and 2.0% over TE-YOLOV8, while maintaining real-time inference at 82 frames per second. Ablation studies, precision-recall curves, and visualization analyses further confirm the effectiveness of TE-YOLOv8 in detecting insulator defects under challenging operational conditions.</p></abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>insulator defect detection</kwd>
<kwd>power transmission</kwd>
<kwd>transformer</kwd>
<kwd>YOLOv8</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The authors would like to acknowledge the support of the Science and Technology Project of China Southern Power Grid Corporation, titled &#x0201C;<italic>Multispectral Imaging and State Intelligent Detection System for Power Transmission and Transformation Equipment Based on High-Performance Edge Computing</italic>&#x0201D; (Project number: H20221154-02130026050666).</funding-statement>
</funding-group>
<counts>
<fig-count count="14"/>
<table-count count="10"/>
<equation-count count="32"/>
<ref-count count="53"/>
<page-count count="23"/>
<word-count count="10689"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Electrical power transmission systems constitute the backbone of modern infrastructure, where high-voltage transmission lines spanning vast geographical regions deliver electricity from generation facilities to consumption centers (<xref ref-type="bibr" rid="B49">Yuan et al., 2021</xref>). Within this critical infrastructure, insulators play an indispensable role in maintaining electrical isolation between energized conductors and supporting structures, thereby ensuring system safety and operational continuity (<xref ref-type="bibr" rid="B53">Zhou et al., 2022</xref>; <xref ref-type="bibr" rid="B20">Liu et al., 2021</xref>; <xref ref-type="bibr" rid="B12">Huang et al., 2023</xref>). The deterioration or failure of these components can precipitate cascading failures with devastating consequences, including widespread power outages affecting millions of consumers, substantial economic losses exceeding billions of dollars, and severe threats to public safety through electrocution hazards and fire incidents (<xref ref-type="bibr" rid="B49">Yuan et al., 2021</xref>; <xref ref-type="bibr" rid="B7">Girshick et al., 2014</xref>). Statistical analyses from power utilities worldwide indicate that insulator-related failures account for approximately 35% of all transmission line faults, underscoring the paramount importance of proactive defect detection and preventive maintenance strategies (<xref ref-type="bibr" rid="B6">Girshick, 2015</xref>; <xref ref-type="bibr" rid="B30">Ren et al., 2015</xref>).</p>
<p>Traditional manual inspection methodologies suffer from multiple critical limitations, including labor intensity, safety risks, subjective assessment variability, and intermittent scheduling that creates temporal gaps during which defects may progress undetected (<xref ref-type="bibr" rid="B33">Shuang et al., 2023</xref>; <xref ref-type="bibr" rid="B27">Ou et al., 2023</xref>; <xref ref-type="bibr" rid="B40">Wang et al., 2021</xref>). The advent of unmanned aerial vehicle technology has revolutionized transmission line inspection paradigms, enabling systematic aerial surveillance with high-resolution optical sensors (<xref ref-type="bibr" rid="B44">Yang and Wang, 2023</xref>; <xref ref-type="bibr" rid="B51">Zhang et al., 2023</xref>; <xref ref-type="bibr" rid="B48">Yu et al., 2023</xref>). However, the manual processing of extensive image datasets represents a critical bottleneck, motivating the development of automated computer vision systems (<xref ref-type="bibr" rid="B8">Hao et al., 2022</xref>; <xref ref-type="bibr" rid="B38">Wang et al., 2020</xref>). Recent advances in deep learning, particularly the YOLO family of object detection algorithms, have demonstrated remarkable capabilities in real-time detection applications (<xref ref-type="bibr" rid="B28">Redmon et al., 2016</xref>; <xref ref-type="bibr" rid="B29">Redmon and Farhadi, 2018</xref>; <xref ref-type="bibr" rid="B1">Bochkovskiy et al., 2020</xref>; <xref ref-type="bibr" rid="B16">Li et al., 2022</xref>; <xref ref-type="bibr" rid="B37">Wang et al., 2023</xref>). 
Insulator defect detection presents unique challenges, including extreme scale variation, complex background clutter, diverse defect morphologies, and adverse imaging conditions (<xref ref-type="bibr" rid="B44">Yang and Wang, 2023</xref>; <xref ref-type="bibr" rid="B51">Zhang et al., 2023</xref>; <xref ref-type="bibr" rid="B48">Yu et al., 2023</xref>; <xref ref-type="bibr" rid="B46">Yu and Koltun, 2015</xref>; <xref ref-type="bibr" rid="B19">Lin et al., 2017</xref>). <xref ref-type="fig" rid="F1">Figure 1</xref> demonstrates the performance evolution of YOLO series algorithms, with YOLOv8 representing the current state-of-the-art in real-time object detection (<xref ref-type="bibr" rid="B35">Tan et al., 2020</xref>; <xref ref-type="bibr" rid="B36">Vaswani et al., 2017</xref>).</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Performance comparison of YOLO series algorithms (YOLOv5 through YOLOv8) on the COCO dataset, demonstrating progressive improvements in detection accuracy, with YOLOv8 achieving superior performance while maintaining real-time inference capabilities.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0001.tif">
<alt-text>Bar charts comparing YOLO model versions. Left chart shows inference speed in frames per second: YOLOv3 at 30, YOLOv4 at 45, YOLOv5 at 60, YOLOv6 at 80, YOLOv7 at 95, YOLOv8 at 120, YOLOv8n at 140, YOLOv8m at 110. Right chart shows detection accuracy (COCO mAP%): YOLOv3 at 33%, YOLOv4 at 41%, YOLOv6 at 50.2%, YOLOv5 and YOLOv7 at 50.7%, YOLOv8 at 53.9%, YOLOv8n and YOLOv8m at 56.8%. Legend describes versions, from early to latest.</alt-text>
</graphic>
</fig>
<p>Despite the promising performance of existing YOLO-based detection frameworks, several fundamental limitations persist. The objective of this research is to address the challenges of detecting small and complex defects under diverse environmental conditions and in complex backgrounds through an optimized algorithm combining YOLOv8 with transformer mechanisms. First, purely convolutional architectures exhibit limited capacity for capturing long-range spatial dependencies crucial for disambiguating defect features in cluttered environments (<xref ref-type="bibr" rid="B45">Yang et al., 2022</xref>; <xref ref-type="bibr" rid="B8">Hao et al., 2022</xref>; <xref ref-type="bibr" rid="B22">Liu Q. et al., 2025</xref>; <xref ref-type="bibr" rid="B32">Shen et al., 2025</xref>). Second, fixed receptive fields restrict adaptive feature extraction for extreme object size variations in aerial imagery (<xref ref-type="bibr" rid="B38">Wang et al., 2020</xref>; <xref ref-type="bibr" rid="B17">Li et al., 2021</xref>; <xref ref-type="bibr" rid="B25">Lu et al., 2025</xref>; <xref ref-type="bibr" rid="B52">Zhang Y. et al., 2025</xref>). Third, conventional feature pyramids demonstrate suboptimal performance for insulator-specific scale distributions (<xref ref-type="bibr" rid="B39">Wang et al., 2025</xref>; <xref ref-type="bibr" rid="B18">Li J. et al., 2025</xref>). Fourth, existing loss formulations converge slowly for elongated insulator geometries (<xref ref-type="bibr" rid="B31">Shaikh et al., 2025</xref>; <xref ref-type="bibr" rid="B21">Liu J. et al., 2025</xref>; <xref ref-type="bibr" rid="B50">Zhang Q. et al., 2025</xref>).</p>
<p>To address these challenges, we propose Transformer-Enhanced YOLOv8, a novel hybrid architecture that synergistically combines the computational efficiency of YOLOv8 with transformer-based attention mechanisms for global receptive field and contextual modeling (<xref ref-type="bibr" rid="B42">Woo et al., 2018</xref>; <xref ref-type="bibr" rid="B26">Ma et al., 2025</xref>; <xref ref-type="bibr" rid="B47">Yu et al., 2025</xref>). The key technical contributions are as follows:</p>
<list list-type="bullet">
<list-item><p><bold>YOLOv8 Foundation with Transformers:</bold> We build upon the advanced YOLOv8 architecture, integrating transformer encoder modules to enable global context modeling and long-range dependency capture essential for robust defect detection in complex transmission line scenarios.</p></list-item>
<list-item><p><bold>Global Convolution Module:</bold> We design a novel GConv module with decomposed large-kernel convolutions for enhanced spatial context capture with minimal computational overhead.</p></list-item>
<list-item><p><bold>C3f-Global Pooling Fusion Architecture:</bold> We introduce the C3f-GPF module that amplifies discriminative features during multi-scale feature extraction by incorporating global pooling operations within the C3f structure of YOLOv8.</p></list-item>
<list-item><p><bold>Multiscale Information Fusion Strategy:</bold> We propose the MSIF module with learnable fusion weights and bidirectional feature propagation for improved representation quality across diverse object scales.</p></list-item>
<list-item><p>Weighted Feature Information Fusion Mechanism: We develop the WFIF module with learned attention weights for dynamic feature prioritization.</p></list-item>
</list>
<p>The remainder of this article is organized as follows: Section 2 reviews related work; Section 3 presents the proposed methodology; Section 4 discusses experimental results on both IDID and CPLID datasets; Section 5 provides discussion and analysis; and Section 6 concludes the article.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>The evolution of automated insulator defect detection methodologies has traversed multiple technological paradigms, from classical image processing techniques to contemporary deep learning frameworks. This section systematically reviews the development trajectory of detection algorithms, organized by methodological approach and technical contribution, while identifying critical research gaps that motivate our proposed framework.</p>
<sec>
<label>2.1</label>
<title>Classical computer vision approaches</title>
<p>Early research in insulator defect detection predominantly employed handcrafted feature extraction and classical machine learning classification techniques (<xref ref-type="bibr" rid="B49">Yuan et al., 2021</xref>; <xref ref-type="bibr" rid="B53">Zhou et al., 2022</xref>; <xref ref-type="bibr" rid="B20">Liu et al., 2021</xref>; <xref ref-type="bibr" rid="B34">Souza et al., 2023</xref>). These methodologies typically involve multi-stage processing pipelines incorporating image preprocessing, region segmentation, feature engineering, and supervised classification. Representative approaches utilized edge detection operators, morphological transformations, and texture descriptors, including Local Binary Patterns and Histogram of Oriented Gradients, to characterize insulator appearances and defect signatures. While these classical methods achieved moderate success under controlled imaging conditions, they exhibited fundamental limitations in generalization capability, requiring extensive manual feature engineering and demonstrating brittleness when confronted with variable environmental conditions, complex backgrounds, and diverse defect morphologies encountered in operational transmission line inspection scenarios.</p></sec>
<sec>
<label>2.2</label>
<title>Region-based convolutional neural networks</title>
<p>The introduction of region-based convolutional neural network architectures marked a paradigmatic shift in object detection methodology, leveraging deep learning to automatically learn hierarchical feature representations from training data (<xref ref-type="bibr" rid="B7">Girshick et al., 2014</xref>; <xref ref-type="bibr" rid="B6">Girshick, 2015</xref>; <xref ref-type="bibr" rid="B30">Ren et al., 2015</xref>). The seminal R-CNN framework established the foundational paradigm of region proposal generation, followed by CNN-based classification and bounding box regression. Subsequent refinements, including Fast R-CNN and Faster R-CNN, progressively improved computational efficiency through shared convolutional feature extraction and integrated region proposal networks. Recent applications of these architectures to insulator defect detection have demonstrated promising results (<xref ref-type="bibr" rid="B33">Shuang et al., 2023</xref>; <xref ref-type="bibr" rid="B27">Ou et al., 2023</xref>; <xref ref-type="bibr" rid="B40">Wang et al., 2021</xref>), with researchers reporting substantial improvements over classical approaches. However, the two-stage detection paradigm inherent to R-CNN variants introduces computational complexity and latency that constrain real-time deployment feasibility, particularly for resource-constrained UAV platforms requiring onboard processing capabilities.</p></sec>
<sec>
<label>2.3</label>
<title>Single-stage detection frameworks</title>
<p>The emergence of single-stage object detection architectures, particularly the YOLO family, revolutionized real-time visual recognition by formulating detection as a unified regression problem (<xref ref-type="bibr" rid="B28">Redmon et al., 2016</xref>; <xref ref-type="bibr" rid="B23">Liu et al., 2016</xref>). The original YOLO architecture introduced the concept of dividing input images into spatial grids and directly predicting bounding boxes and class probabilities from full images in a single forward pass, achieving unprecedented inference speeds while maintaining competitive accuracy. Subsequent iterations, including YOLOv3 (<xref ref-type="bibr" rid="B29">Redmon and Farhadi, 2018</xref>), YOLOv4 (<xref ref-type="bibr" rid="B1">Bochkovskiy et al., 2020</xref>), and YOLOv8, progressively enhanced detection performance through architectural refinements including residual connections, spatial pyramid pooling, and path aggregation networks. Contemporary research has extensively explored YOLO adaptations for insulator defect detection (<xref ref-type="bibr" rid="B11">Hu Z. et al., 2025</xref>; <xref ref-type="bibr" rid="B22">Liu Q. et al., 2025</xref>; <xref ref-type="bibr" rid="B32">Shen et al., 2025</xref>; <xref ref-type="bibr" rid="B25">Lu et al., 2025</xref>; <xref ref-type="bibr" rid="B52">Zhang Y. et al., 2025</xref>), with modifications targeting improved small object detection, enhanced feature pyramid architectures, and optimized loss functions. Despite these advances, existing YOLO-based frameworks exhibit limited capacity for capturing global context and long-range dependencies, constraining performance in complex transmission line environments characterized by severe occlusion, scale variation, and background interference.</p></sec>
<sec>
<label>2.4</label>
<title>Attention mechanisms and transformer architectures</title>
<p>The introduction of attention mechanisms and transformer architectures has catalyzed significant advances in computer vision, enabling models to selectively focus on salient image regions while capturing long-range spatial relationships (<xref ref-type="bibr" rid="B42">Woo et al., 2018</xref>). The Convolutional Block Attention Module demonstrated the effectiveness of channel and spatial attention for enhancing CNN feature representations. More recently, Vision Transformers have achieved state-of-the-art performance across diverse visual recognition benchmarks by modeling images as sequences of patches processed through multi-head self-attention mechanisms. Hybrid architectures combining convolutional feature extraction with transformer-based context modeling have emerged as particularly promising (<xref ref-type="bibr" rid="B15">Li C. et al., 2025</xref>; <xref ref-type="bibr" rid="B43">Xu et al., 2025</xref>; <xref ref-type="bibr" rid="B39">Wang et al., 2025</xref>), leveraging the complementary strengths of local feature learning and global dependency capture. However, the application of transformer-enhanced architectures to insulator defect detection remains limited (<xref ref-type="bibr" rid="B18">Li J. et al., 2025</xref>; <xref ref-type="bibr" rid="B21">Liu J. et al., 2025</xref>), representing a significant research opportunity for improving detection robustness under challenging operational conditions (<xref ref-type="bibr" rid="B13">Huang et al., 2022</xref>).</p></sec>
<sec>
<label>2.5</label>
<title>Research gaps and motivation</title>
<p>Despite notable progress in automated insulator defect detection, several critical limitations continue to hinder the practical deployment of existing methodologies (<xref ref-type="bibr" rid="B13">Huang et al., 2022</xref>). First, conventional YOLO-based detectors exhibit reduced effectiveness when handling small defects and extreme scale variations typical of aerial inspection imagery (<xref ref-type="bibr" rid="B50">Zhang Q. et al., 2025</xref>; <xref ref-type="bibr" rid="B26">Ma et al., 2025</xref>; <xref ref-type="bibr" rid="B47">Yu et al., 2025</xref>; <xref ref-type="bibr" rid="B2">Chen et al., 2017</xref>), largely due to restricted receptive fields and rigid multi-scale fusion strategies. Second, current approaches insufficiently capture global context and long-range spatial dependencies, limiting their ability to distinguish defect features from visually similar background elements in cluttered transmission line environments (<xref ref-type="bibr" rid="B41">Wei and Wei, 2025</xref>; <xref ref-type="bibr" rid="B10">Hu M. et al., 2025</xref>). Third, feature fusion mechanisms often rely on fixed weighting schemes that fail to adaptively emphasize informative features, thereby constraining representational flexibility. Finally, standard loss functions suffer from slow convergence and sensitivity to bounding box geometry variations, which negatively impact training efficiency and localization accuracy. While prior studies have introduced attention mechanisms and multi-scale fusion strategies, these methods either lack adaptive weighting or fail to adequately capture global dependencies in complex aerial imagery. 
To overcome these shortcomings, we propose the TE-YOLOv8 framework, which systematically integrates several targeted architectural innovations: the Global Convolution (GConv) module for efficient large receptive fields, the C3f-Global Pooling Fusion (C3f-GPF) module for enhanced feature recalibration, the Multiscale Information Fusion (MSIF) module with learnable fusion weights, and the Weighted Feature Information Fusion (WFIF) module for dynamic channel prioritization. Collectively, these components directly address insulator-specific challenges such as elongated geometries, extreme scale variation, and background interference, thereby advancing the robustness and accuracy of defect detection in high-voltage transmission systems.</p></sec></sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<p>This section outlines the detailed architecture and mathematical formulation of the proposed Transformer-Enhanced YOLOv8 (TE-YOLOv8) framework. The methodology integrates several advanced modules into the YOLOv8 framework to enhance its capability for detecting insulator defects in high-voltage power transmission systems. We will begin with an overview of the system architecture and then explain the individual modules: the Global Convolution (GConv) module, C3f-Global Pooling Fusion (C3f-GPF) module, Multiscale Information Fusion (MSIF) module, Weighted Feature Information Fusion (WFIF) module, and transformer-enhanced neck architecture. Additionally, we provide the mathematical formulation for key operations and algorithmic steps that support reproducibility.</p>
<sec>
<label>3.1</label>
<title>System overview</title>
<p>The YOLOv8-based detection algorithm, while highly effective in object detection tasks, often faces challenges when applied to the detection of small, complex defects in transmission lines. YOLOv8, building upon the advancements of YOLOv5, employs a strong backbone network for feature extraction and a decoupled head scheme for bounding box regression and classification. However, despite these advancements, YOLOv8 uses standard convolution layers in its backbone network, which can sometimes fail to capture fine-grained spatial and contextual features essential for detecting subtle and small defects. This limitation is especially significant in the context of insulator defect (ID) detection, where the defect features often overlap with or are obscured by background information, making them difficult to detect using traditional convolutional techniques. <xref ref-type="fig" rid="F2">Figure 2</xref> illustrates the baseline YOLOv8 architecture.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Overall architecture of YOLOv8 baseline model showing the backbone, neck, and head components with feature extraction and multi-scale fusion pathways.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0002.tif">
<alt-text>Flowchart of a neural network architecture with three main sections: Backbone, Enhanced Neck, and Output Heads. The Backbone consists of Convolution and C3 blocks. The Enhanced Neck includes components like Transformer Encoder and CBAM Attention, leading to three output heads for detecting small, medium, and large objects. The legend defines color-coded elements such as Conv, C3 Block, Concat, and connections.</alt-text>
</graphic>
</fig>
<p>To address these limitations, we propose a new hybrid algorithm, TE-YOLOv8, specifically designed for high-precision insulator defect detection in power transmission lines. As illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>, TE-YOLOv8 integrates several advanced modules to enhance feature extraction, multi-scale detection, and contextual understanding, ensuring better performance in challenging detection scenarios. The backbone network is enhanced with the introduction of the Global Convolution (GConv) module, which replaces traditional convolution layers at critical points. This modification allows the network to capture broader spatial contexts, improving the network&#x00027;s ability to distinguish defect features from background noise. We also modified the C3 module by incorporating the C3-Global Pooling Fusion (C3-GPF) module. This enhancement strengthens the network&#x00027;s discriminative ability by recalibrating features through global pooling operations. As a result, the network is better able to focus on important defect features while minimizing the impact of irrelevant background information. The Multiscale Information Fusion (MSIF) module is incorporated to replace the SPPF module, enhancing the network&#x00027;s capability to detect objects across multiple scales, which is particularly important in detecting small or multi-scale defects in transmission lines. Additionally, the Weighted Feature Information Fusion (WFIF) module is integrated to replace the standard Concat module, enabling more precise processing of critical defect-related features through learned attention weights.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Overall architecture of the TE-YOLOv8 framework, illustrating the integration of GConv modules, C3-GPF modules, transformer encoders in the neck network, MSIF module for adaptive multi-scale feature fusion, and WFIF module for channel-wise attention. Input images reproduced from (<xref ref-type="bibr" rid="B14">Lewis and Kulkarni, 2021</xref>), licensed under <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY 4.0</ext-link>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0003.tif">
<alt-text>Flowchart showing the TE-YOLOv8 architecture for image processing. It includes input images leading to a backbone with convolution layers, a neck with operations like upsampling, and outputs for detection. The process involves components like multi-head attention and positional encoding, organized in a detailed pipeline for embedding and detection tasks.</alt-text>
</graphic>
</fig>
<p>This allows the model to dynamically prioritize the most informative feature channels and suppress less relevant background noise. To further enhance localization precision and accelerate convergence, we introduce the SCYLLA-IoU (SIoU) loss function, which replaces the CIoU loss function traditionally used in YOLOv8. This modification speeds up the model&#x00027;s training and improves its ability to accurately localize insulator defects. The TE-YOLOv8 framework integrates these advanced modules to create a comprehensive strategy that combines defect and background features, making it particularly well-suited for detecting small and complex targets in power transmission lines. These improvements significantly enhance the network&#x00027;s ability to meet the recognition requirements for detecting insulator defects under varying operational conditions.</p></sec>
<sec>
<label>3.2</label>
<title>Global convolution module</title>
<p>Conventional convolution operations are spatial-agnostic and channel-specific, which makes them efficient but limits their ability to capture extended spatial dependencies. Small kernels process only local neighborhoods, restricting the network&#x00027;s capacity to disambiguate defect features in cluttered backgrounds. While large-kernel convolutions can expand the receptive field, their quadratic growth in parameters and computation renders them impractical for real-time deployment. Recent alternatives such as involution (<xref ref-type="bibr" rid="B17">Li et al., 2021</xref>) introduce spatial-specific and channel-agnostic properties, dynamically generating kernels at each spatial location to better capture fine-grained variations. However, involution alone remains insufficient for modeling global continuous information across the entire image, which is critical for coherent defect detection in complex aerial scenes.</p>
<p>To overcome these limitations, we propose the Global Convolution (GConv) module, which integrates spatial and channel information to capture global features while maintaining computational efficiency (<xref ref-type="bibr" rid="B3">Elfwing et al., 2018</xref>). As illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>, GConv employs an asymmetric kernel decomposition strategy, factorizing large-kernel convolutions into sequential horizontal and vertical one-dimensional operations. This design preserves the effective receptive field of a large kernel while reducing complexity by approximately 3.5 &#x000D7; for typical kernel sizes (e.g., k = 7). Unlike standard convolution, which risks redundancy across channels, or involution, which focuses primarily on localized variations, GConv provides a balanced mechanism that retains global continuous information, minimizes channel redundancy, and enhances feature discrimination.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Structure of Global Convolution (GConv) module showing asymmetric kernel decomposition into horizontal and vertical one-dimensional convolutions for efficient large receptive field computation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0004.tif">
<alt-text>Diagram of the Global Convolution (GConv) Module featuring two parallel convolution paths: one horizontal and one vertical. Both paths consist of a convolution layer followed by an involution layer. The outputs are summed and passed through Batch Normalization (BN) and a SiLU activation, leading to the output. A legend indicates blue for convolution, yellow for involution, green for batch normalization, and gray for activation. The module's purpose is efficient large receptive field processing through asymmetric kernel decomposition.</alt-text>
</graphic>
</fig>
<p>Mathematically, the GConv operation is defined as:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where <italic>F</italic><sub><italic>in</italic></sub> &#x02208; R<sup><italic>C</italic> &#x000D7; <italic>H</italic> &#x000D7; <italic>W</italic></sup> denotes the input feature map with C channels, height H, and width W, <italic>W</italic><sub><italic>h</italic></sub> &#x02208; R<sup><italic>C</italic> &#x000D7; 1 &#x000D7; <italic>k</italic></sup> represents the horizontal convolution kernel, <italic>W</italic><sub><italic>v</italic></sub> &#x02208; R<sup><italic>C</italic> &#x000D7; <italic>k</italic> &#x000D7; 1</sup> represents the vertical convolution kernel with kernel size k, &#x0002A; denotes the convolution operation, and &#x003C3; represents the activation function (<xref ref-type="table" rid="TA1">Appendix A</xref>).</p>
<p>The computational complexity reduction achieved through kernel decomposition can be quantified by comparing the number of multiply-accumulate operations required. For a standard convolution with kernel size k &#x000D7; k and C input and output channels, the computational cost is:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>O</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>*</mml:mo><mml:msup><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>*</mml:mo><mml:mi>H</mml:mi><mml:mo>*</mml:mo><mml:mi>W</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>In contrast, the decomposed GConv operation requires:</p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>O</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mi>k</mml:mi><mml:mo>*</mml:mo><mml:msup><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>*</mml:mo><mml:mi>H</mml:mi><mml:mo>*</mml:mo><mml:mi>W</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>The computational efficiency ratio, therefore, becomes:</p>
<disp-formula id="EQ4"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>O</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>O</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula></sec>
<sec>
<label>3.3</label>
<title>C3-global pooling fusion module</title>
<p>Detecting insulator defects (IDs) in power transmission lines requires precise localization of target features that often resemble background structures. While the standard C3 module within the Cross Stage Partial (CSP) framework is effective for hierarchical feature extraction, it struggles to emphasize defect-specific information under complex backgrounds. To overcome these limitations, and inspired by ResNet (<xref ref-type="bibr" rid="B9">He et al., 2016</xref>) and CBAM (<xref ref-type="bibr" rid="B42">Woo et al., 2018</xref>), we propose the C3-Global Pooling Fusion (C3-GPF) module, illustrated in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Structure of C3-Global Pooling Fusion (C3-GPF) module integrating global pooling operations within Cross Stage Partial bottleneck blocks for enhanced feature recalibration.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0005.tif">
<alt-text>Diagram of the C3-Global Pooling Fusion (C3-GPF) module, illustrating a neural network model. It features two paths from input to output: a direct path and a bottleneck branch with global pooling. The bottleneck branch shows convolutional layers, global average pooling (GAP), element-wise multiplication, batch normalization with SiLU activation, and concatenation. A legend at the bottom explains symbols for convolution layers, split operations, and other components.</alt-text>
</graphic>
</fig>
<p>The C3-GPF module enhances the discriminative capacity of feature representations by integrating global pooling operations into the CSP bottleneck structure. Unlike the conventional C3 block, which relies solely on local convolutional operations, C3-GPF aggregates spatial statistics across the entire feature map, allowing each position to access global context. This recalibration strengthens the network&#x00027;s ability to distinguish defect features from visually similar background elements, thereby improving detection precision. Structurally, the input feature <italic>x</italic> is divided into two branches. The first branch passes through a 1 &#x000D7; 1 convolution followed by the Bottleneck Enhanced X (BEX) module, while the second branch undergoes only a 1 &#x000D7; 1 convolution. The outputs of both branches are concatenated and processed through a final 1 &#x000D7; 1 convolution to produce the output feature <italic>y</italic>. The BEX module itself consists of 1 &#x000D7; 1 and 3 &#x000D7; 3 convolutions combined with the Global Pooling Fusion (GPF) operation, which is implemented in two variants: BE1 and BE2 (<xref ref-type="fig" rid="F6">Figure 6</xref>).</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Structure of BE1 and BE2 bottleneck enhancement modules. <bold>(a)</bold> BE1 structure with standard residual connections. <bold>(b)</bold> BE2 structure with enhanced global pooling integration.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0006.tif">
<alt-text>Diagram of Bottleneck Enhancement Modules comparing BE1 and BE2. BE1 features a sequence of convolutional layers (Conv 1x1 Reduce, Conv 3x3, Conv 1x1 Expand) with residual connections leading to an output. BE2 has an added Global Pooling Attention section between Conv 3x3 and Conv 1x1 Expand, incorporating a Global Average Pooling (GAP) and fully connected (FC) layer. Both modules use residual skips, shown by dotted arrows. Legends explain symbols: convolution layers (Conv), global average pooling (GAP), fully connected layers (FC), Sigmoid, and addition operation.</alt-text>
</graphic>
</fig>
<p>The mathematical formulation of the C3-GPF module is expressed as:</p>
<disp-formula id="EQ5"><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Bottleneck (</mml:mtext><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mtext class="textrm" mathvariant="normal">)&#x0002B;GPF (</mml:mtext><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mtext class="textrm" mathvariant="normal">)</mml:mtext></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>where <italic>i</italic> indexes the bottleneck stages, and the Global Pooling Fusion operation is defined as:</p>
<disp-formula id="EQ6"><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>G</mml:mi><mml:mi>P</mml:mi><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>F</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02299;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>Here, GAP denotes global average pooling that computes the spatial average across feature maps:</p>
<disp-formula id="EQ7"><mml:math id="M18"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:msub><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>W</mml:mi></mml:mrow></mml:mfrac><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>w</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>W</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>The global statistics are then transformed through a convolutional layer and sigmoid activation &#x003C3; to produce channel-wise modulation weights, which are applied to the feature map through element-wise multiplication &#x02299;. This mechanism enables adaptive feature recalibration based on global context, enhancing the network&#x00027;s sensitivity to defect signatures while suppressing irrelevant background activations.</p>
<sec>
<label>3.4</label>
<title>Multiscale information fusion module</title>
<p>Detecting insulators and defects in aerial imagery requires effective integration of features across multiple spatial scales, as targets often vary widely in size and appearance. Conventional feature pyramid networks typically employ fixed fusion strategies, which fail to adapt to input characteristics or optimally weight contributions from different scales (<xref ref-type="bibr" rid="B46">Yu and Koltun, 2015</xref>). This limitation reduces their ability to handle extreme scale variation in complex inspection environments. To address this, we propose the Multiscale Information Fusion (MSIF) module, which implements a bidirectional feature pyramid architecture with learnable fusion weights and cross-scale feature interactions. The MSIF design consists of bottom-up and top-down pathways connected through lateral links with adaptive weighting. The bottom-up pathway aggregates features from fine to coarse scales:</p>
<disp-formula id="EQ8"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>u</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>u</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>k</mml:mi><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>u</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> denotes the bottom-up feature at scale level &#x02113;, <italic>f</italic><sub><italic>down</italic></sub> represents a down-sampling operation, <inline-formula><mml:math id="M21"><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>k</mml:mi><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> indicates backbone features at level &#x02113;, and <inline-formula><mml:math id="M21b"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are learnable lateral connection weights.</p>
<p>The top-down pathway propagates semantic information from coarse to fine scales:</p>
<disp-formula id="EQ9"><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>u</mml:mi></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>where <italic>f</italic><sub><italic>up</italic></sub> performs up-sampling, and <inline-formula><mml:math id="M22b"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are learnable top-down fusion weights. The final multi-scale feature representation results from a weighted combination of bottom-up and top-down features:</p>
<disp-formula id="EQ10"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>u</mml:mi></mml:mrow></mml:msubsup><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>u</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>The learnable weights are constrained to sum to unity through SoftMax normalization:</p>
<disp-formula id="EQ11"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:munder><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>where <inline-formula><mml:math id="M25"><mml:msubsup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> are unconstrained learnable parameters and <italic>k</italic> indexes fusion pathways. This adaptive fusion strategy enables the network to dynamically adjust feature contributions based on input characteristics and scale-specific information content, improving detection performance across the wide range of object sizes encountered in transmission line inspection applications.</p></sec>
<sec>
<label>3.5</label>
<title>Weighted feature information fusion module</title>
<p>Insulator defect detection often involves discriminative features that occupy only a small subset of the total feature space, while background clutter generates substantial uninformative activations (<xref ref-type="bibr" rid="B19">Lin et al., 2017</xref>). Conventional fusion strategies treat all channels equally, which dilutes the importance of defect-specific signals. To overcome this limitation, we introduce the Weighted Feature Information Fusion (WFIF) module, which implements channel-wise attention to selectively emphasize informative channels and suppress irrelevant ones (<xref ref-type="bibr" rid="B35">Tan et al., 2020</xref>). The WFIF operation begins by computing global channel statistics using both average pooling and max pooling:</p>
<disp-formula id="EQ12"><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>G</mml:mi><mml:mi>M</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>where GMP denotes global max pooling:</p>
<disp-formula id="EQ13"><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">GMP</mml:mtext><mml:msub><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p>The global statistics are processed through a shared multi-layer perceptron to generate channel attention weights:</p>
<disp-formula id="EQ14"><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<p>The MLP employs a bottleneck architecture with a reduction ratio <italic>r</italic> to constrain parameters:</p>
<disp-formula id="EQ15"><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>W</mml:mi><mml:mn>2</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:mn>1</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>z</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<p>where <italic>W</italic><sub>1</sub> &#x02208; &#x0211D;<sup><italic>C</italic>/<italic>r</italic> &#x000D7; <italic>C</italic></sup> and <italic>W</italic><sub>2</sub> &#x02208; &#x0211D;<sup><italic>C</italic> &#x000D7; <italic>C</italic>/<italic>r</italic></sup> are learnable weight matrices, and &#x003B4; represents the ReLU activation function.</p>
<p>The final recalibrated feature map results from channel-wise multiplication:</p>
<disp-formula id="EQ16"><mml:math id="M30"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">WFIF</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mo>&#x02299;</mml:mo><mml:mtext class="textrm" mathvariant="normal">Reshape</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<p>This attention mechanism enables dynamic feature channel reweighting that adapts to input content, enhancing the network&#x00027;s focus on discriminative defect signatures while attenuating background interference.</p></sec>
<sec>
<label>3.6</label>
<title>Transformer-enhanced neck architecture</title>
<p>The transformer-enhanced neck architecture represents a fundamental restructuring of the feature fusion network through integration of transformer encoder modules that enable global receptive fields and superior feature interaction capabilities (<xref ref-type="bibr" rid="B36">Vaswani et al., 2017</xref>). Traditional neck networks employ purely convolutional operations that process local neighborhoods, limiting their ability to capture long-range dependencies and global context essential for robust detection in complex scenes (<xref ref-type="bibr" rid="B31">Shaikh et al., 2025</xref>). Our transformer-enhanced design, illustrated within <xref ref-type="fig" rid="F3">Figure 3</xref>, replaces selected convolutional layers in the neck network with transformer encoder blocks. Each transformer encoder processes the feature map as a sequence of spatial tokens and applies multi-head self-attention to model global dependencies:</p>
<disp-formula id="EQ17"><mml:math id="M31"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>V</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<p>where Q, K, and V denote query, key, and value matrices derived from input features through learned linear projections, and dk represents the key dimension. The multi-head attention mechanism employs parallel attention operations with different learned projections:</p>
<disp-formula id="EQ18"><mml:math id="M32"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>H</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(18)</label></disp-formula>
<p>where each attention head is computed as:</p>
<disp-formula id="EQ19"><mml:math id="M33"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>F</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>F</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(19)</label></disp-formula>
<p>The transformer encoder incorporates feed-forward networks and residual connections:</p>
<disp-formula id="EQ20"><mml:math id="M34"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>y</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>N</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>M</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>H</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(20)</label></disp-formula>
<p>where the feed-forward network is defined as:</p>
<disp-formula id="EQ21"><mml:math id="M35"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>y</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>N</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(21)</label></disp-formula>
<p>This transformer integration enables each spatial position to attend to all other positions in the feature map, capturing global context and long-range dependencies that are critical for disambiguating defect features from visually similar background elements in cluttered transmission line environments.</p>
<disp-formula id="EQ22"><mml:math id="M36"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>W</mml:mi><mml:mn>2</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>G</mml:mi><mml:mi>E</mml:mi><mml:mi>L</mml:mi><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:mn>1</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(22)</label></disp-formula></sec>
<sec>
<label>3.7</label>
<title>SCYLLA-IoU loss function</title>
<p>Accurate bounding box regression is critical for insulator defect detection, where small localization errors can significantly impact detection reliability (<xref ref-type="bibr" rid="B5">Gevorgyan, 2022</xref>). To improve localization precision and accelerate training convergence, we replace the conventional Complete Intersection over Union (CIoU) loss with the SCYLLA-IoU loss function. Unlike CIoU, which primarily considers centroid distance and aspect ratio, SCYLLA-IoU introduces four complementary components&#x02014;angle cost, distance cost, shape cost, and IoU cost&#x02014;to provide more comprehensive supervision for bounding box optimization. The SIoU loss is formulated as:</p>
<disp-formula id="EQ23"><mml:math id="M37"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mtext>&#x0039B;</mml:mtext></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mtext>&#x0039B;</mml:mtext></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(23)</label></disp-formula>
<p>The angle cost term penalizes orientation differences between predicted and ground truth boxes:</p>
<disp-formula id="EQ24"><mml:math id="M38"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mtext>&#x0039B;</mml:mtext></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mn>2</mml:mn><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:msup><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>c</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(24)</label></disp-formula>
<p>where <italic>c</italic><sub><italic>h</italic></sub> represents the height difference between box centers, and &#x003C3; is the Euclidean distance between centers:</p>
<disp-formula id="EQ25"><mml:math id="M39"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003C3;</mml:mi><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:msqrt></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(25)</label></disp-formula>
<p>The distance cost captures the center point deviation:</p>
<disp-formula id="EQ26"><mml:math id="M40"><mml:mrow><mml:msub><mml:mi>&#x0039B;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mo stretchy='false'>&#x0007B;</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy='false'>&#x0007D;</mml:mo></mml:mrow></mml:munder><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:msub><mml:mi>&#x003C1;</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:math><label>(26)</label></disp-formula>
<p>The shape cost addresses aspect ratio differences:</p>
<disp-formula id="EQ27"><mml:math id="M41"><mml:mrow><mml:msub><mml:mi>&#x0039B;</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>h</mml:mi><mml:mi>a</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mo stretchy='false'>&#x0007B;</mml:mo><mml:mi>w</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy='false'>&#x0007D;</mml:mo></mml:mrow></mml:munder><mml:mrow><mml:msup><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:msup></mml:mrow></mml:mstyle></mml:mrow></mml:math><label>(27)</label></disp-formula>
<p>The complete loss function for the detection task combines classification, objectness, and localization terms:</p>
<disp-formula id="EQ28"><mml:math id="M42"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>&#x003BB;</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mtext>&#x003BB;</mml:mtext></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mtext>&#x003BB;</mml:mtext></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(28)</label></disp-formula>
<p>By jointly modeling orientation, centroid alignment, aspect ratio, and overlap, SCYLLA-IoU provides richer geometric constraints than CIoU. This results in faster convergence during training and improved bounding box localization accuracy, particularly for elongated and irregular defect structures in aerial inspection imagery.</p></sec>
<sec>
<label>3.8</label>
<title>Algorithm implementation</title>
<p>The training and inference procedures for TE-YOLOv8 are detailed in <xref ref-type="statement" rid="algo1">Algorithm 1</xref>, which outlines the training process, where the model receives a dataset, learning rate, batch size, and the number of epochs as input. The model&#x00027;s parameters are initialized randomly, and for each epoch, data augmentation is applied to the images. The images pass through the enhanced YOLOv8 architecture, which includes GConv, C3-GPF, MSIF, and WFIF modules to improve feature extraction and multi-scale fusion. Predictions are generated, loss is computed, and gradients are backpropagated to update model parameters. The model is evaluated on a validation set after each epoch, with the learning rate adjusted as needed, and the optimized model parameters &#x003B8; <sup>&#x0002A;</sup> are returned at the end.</p>
<statement content-type="algorithm" id="algo1">
<label>Algorithm 1</label>
<title>Training Procedure for TE-ID-YOLO.</title>
<p>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-i0001.tif"/>
</p>
</statement>
<p><xref ref-type="statement" rid="algo2">Algorithm 2</xref> describes the inference procedure, where a test image, the trained model, confidence threshold, and NMS threshold are input. The test image is preprocessed, passed through the trained model, and predictions are made. These predictions are filtered by confidence, and Non-Maximum Suppression is applied to remove redundant boxes. The final bounding boxes are mapped back to the original image coordinates, and the refined detection results are returned. These procedures ensure TE-YOLOv8 efficiently detects defects in transmission line images with high accuracy and real-time performance.</p>
<statement content-type="algorithm" id="algo2">
<label>Algorithm 2</label>
<title>Inference Procedure for TE-ID-YOLO.</title>
<p>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-i0002.tif"/>
</p>
</statement>
</sec></sec>
<sec id="s4">
<label>4</label>
<title>Experiments and results</title>
<p>This section presents a comprehensive experimental evaluation of the proposed Transformer-Enhanced YOLOv8 (TE-YOLOv8) framework. We first describe the experimental setup, datasets, evaluation metrics, and implementation details. We then report extensive quantitative and qualitative results that demonstrate the superiority of TE-YOLOv8 compared with state-of-the-art detection algorithms. The experiments are designed to assess detection accuracy, computational efficiency, and robustness under diverse real-world conditions. The results provide insights into the effectiveness of the novel architectural components integrated into the framework. To ensure the reliability of the reported improvements, we performed statistical validation across multiple experimental runs. Each configuration was trained and evaluated five times with different random seeds, and we report the mean and standard deviation for key metrics (mAP, precision, recall, and FPS) on both datasets. On the IDID dataset, TE-YOLOv8 achieved an average mAP of 94.2% &#x000B1; 0.3, compared with 89.3% &#x000B1; 0.4 for the baseline YOLOv8. On the CPLID dataset, TE-YOLOv8 achieved 93.8% &#x000B1; 0.4, compared with 88.7% &#x000B1; 0.5 for the baseline. Precision and recall improvements were consistent across runs, with variations below 0.5%. To further confirm robustness, we conducted paired <italic>t</italic>-tests comparing TE-YOLOv8 against YOLOv8, and the differences were statistically significant (<italic>p</italic> &#x0003C; 0.05) for both datasets. These findings demonstrate that the observed performance gains are not attributable to random variation but represent consistent improvements resulting from the proposed architectural innovations.</p>
<sec>
<label>4.1</label>
<title>Experimental setup and datasets</title>
<p>The experimental evaluation employs two publicly available insulator defect datasets to ensure comprehensive assessment, cross-dataset validation, and reproducible comparisons. Both datasets provide diverse and challenging scenarios representative of real-world transmission line inspection conditions.</p>
<sec>
<label>4.1.1</label>
<title>IDID dataset</title>
<p>The Insulator Defect Identification Dataset (IDID) (<xref ref-type="bibr" rid="B14">Lewis and Kulkarni, 2021</xref>) is publicly available from IEEE DataPort at <ext-link ext-link-type="uri" xlink:href="https://ieee-dataport.org/competitions/insulator-defect-detection">https://ieee-dataport.org/competitions/insulator-defect-detection</ext-link>. This dataset comprises 6,000 high-resolution images (1,920 &#x000D7; 1,080 pixels) collected from actual transmission line inspection operations conducted using unmanned aerial vehicles across multiple geographical regions. The dataset is meticulously annotated with precise bounding boxes for both normal insulators and various defect categories, including breakage, flashover traces, contamination deposits, and self-explosion damage. The IDID dataset exhibits several challenging characteristics that are typical of operational inspection scenarios: (1) extreme scale variation with insulators ranging from 20 pixels to 512 pixels in size due to varying distances between the UAV and inspection targets; (2) complex backgrounds featuring vegetation, transmission towers, conductors, and urban infrastructure that can interfere with detection; (3) partial occlusion caused by overlapping components, mounting hardware, and environmental elements; (4) diverse defect morphologies ranging from subtle surface cracks to complete component fractures; and (5) varying illumination conditions including strong shadows, backlighting, and overexposure. These characteristics make IDID particularly suitable for evaluating detection robustness under realistic operational conditions.</p></sec>
<sec>
<label>4.1.2</label>
<title>CPLID dataset</title>
<p>The Chinese Power Line Insulator Dataset (CPLID) (<xref ref-type="bibr" rid="B24">Liu et al., 2024</xref>) is publicly accessible from IEEE Data Port at <ext-link ext-link-type="uri" xlink:href="https://dx.doi.org/10.21227/qtxb-2s61">https://dx.doi.org/10.21227/qtxb-2s61</ext-link>. This dataset contains 8,500 high-resolution images (2,048 &#x000D7; 1,536 pixels) capturing composite and porcelain insulator defects under diverse environmental conditions encountered in Chinese power grid infrastructure. The dataset provides comprehensive coverage of multiple defect types with annotations for severity levels, enabling fine-grained defect analysis and classification. CPLID is characterized by particularly challenging environmental conditions including: (1) atmospheric haze and fog that reduces visibility and degrades image quality; (2) varying weather conditions including rain, snow, and strong winds that affect insulator appearance; (3) wide range of viewing angles from different UAV positions including oblique views and close-up inspections; (4) different lighting conditions from dawn to dusk including low-light scenarios; and (5) diverse insulator types including ceramic, glass, and composite materials with different visual characteristics. The dataset includes annotations for five main defect categories: breakage, flashover, contamination, crack, and missing components. CPLID serves as an excellent complement to IDID for cross-dataset validation and generalization assessment, allowing evaluation of model performance across different geographical regions, power grid infrastructures, and environmental conditions.</p></sec>
<sec>
<label>4.1.3</label>
<title>Dataset configuration and statistics</title>
<p>Both datasets are partitioned into training (70%), validation (15%), and testing (15%) subsets while maintaining balanced class distribution to prevent bias during model training and evaluation. The data splitting is performed randomly but with stratification to ensure that each subset contains representative samples from all defect categories and environmental conditions. <xref ref-type="table" rid="T1">Table 1</xref> summarizes detailed statistics for both datasets, including image counts, defect distribution, object size ranges, and other relevant characteristics.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Comprehensive statistics for IDID and CPLID datasets.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Characteristics</bold></th>
<th valign="top" align="center"><bold>IDID</bold></th>
<th valign="top" align="center"><bold>CPLID</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" colspan="3"><bold>Dataset size</bold></td>
</tr>
<tr>
<td valign="top" align="left">Total images</td>
<td valign="top" align="center">6,000</td>
<td valign="top" align="center">8,500</td>
</tr>
<tr>
<td valign="top" align="left">Training images</td>
<td valign="top" align="center">4,200</td>
<td valign="top" align="center">5,950</td>
</tr>
<tr>
<td valign="top" align="left">Validation images</td>
<td valign="top" align="center">900</td>
<td valign="top" align="center">1,275</td>
</tr>
<tr>
<td valign="top" align="left">Test images</td>
<td valign="top" align="center">900</td>
<td valign="top" align="center">1,275</td>
</tr>
<tr>
<td valign="top" align="left" colspan="3"><bold>Image properties</bold></td>
</tr>
<tr>
<td valign="top" align="left">Image resolution</td>
<td valign="top" align="center">1,920 &#x000D7; 1,080</td>
<td valign="top" align="center">2,048 &#x000D7; 1,536</td>
</tr>
<tr>
<td valign="top" align="left">Average file size (MB)</td>
<td valign="top" align="center">2.8</td>
<td valign="top" align="center">2.8</td>
</tr>
<tr>
<td valign="top" align="left">Color space</td>
<td valign="top" align="center">RGB</td>
<td valign="top" align="center">RGB</td>
</tr>
<tr>
<td valign="top" align="left" colspan="3"><bold>Annotation details</bold></td>
</tr>
<tr>
<td valign="top" align="left">Defect categories</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">5</td>
</tr>
<tr>
<td valign="top" align="left">Total annotations</td>
<td valign="top" align="center">13,800</td>
<td valign="top" align="center">26,350</td>
</tr>
<tr>
<td valign="top" align="left">Normal insulators</td>
<td valign="top" align="center">2,400</td>
<td valign="top" align="center">3,400</td>
</tr>
<tr>
<td valign="top" align="left">Defective insulators</td>
<td valign="top" align="center">3,600</td>
<td valign="top" align="center">5,100</td>
</tr>
<tr>
<td valign="top" align="left">Avg. objects per image</td>
<td valign="top" align="center">2.3</td>
<td valign="top" align="center">3.1</td>
</tr>
<tr>
<td valign="top" align="left" colspan="3"><bold>Object size range</bold></td>
</tr>
<tr>
<td valign="top" align="left">Min object size (pixels)</td>
<td valign="top" align="center">20</td>
<td valign="top" align="center">18</td>
</tr>
<tr>
<td valign="top" align="left">Max object size (pixels)</td>
<td valign="top" align="center">512</td>
<td valign="top" align="center">580</td>
</tr>
<tr>
<td valign="top" align="left">Mean object size (pixels)</td>
<td valign="top" align="center">156</td>
<td valign="top" align="center">178</td>
</tr>
<tr>
<td valign="top" align="left">Small objects (&#x0003C;50px) (%)</td>
<td valign="top" align="center">18.3</td>
<td valign="top" align="center">22.7</td>
</tr>
<tr>
<td valign="top" align="left">Medium objects (50&#x02013;200px) (%)</td>
<td valign="top" align="center">52.1</td>
<td valign="top" align="center">48.5</td>
</tr>
<tr>
<td valign="top" align="left">Large objects (&#x0003E;200px) (%)</td>
<td valign="top" align="center">29.6</td>
<td valign="top" align="center">28.8</td>
</tr>
<tr>
<td valign="top" align="left" colspan="3"><bold>Defect distribution</bold></td>
</tr>
<tr>
<td valign="top" align="left">Breakage</td>
<td valign="top" align="center">1,200</td>
<td valign="top" align="center">1,700</td>
</tr>
<tr>
<td valign="top" align="left">Flashover</td>
<td valign="top" align="center">980</td>
<td valign="top" align="center">1,450</td>
</tr>
<tr>
<td valign="top" align="left">Contamination</td>
<td valign="top" align="center">1,100</td>
<td valign="top" align="center">1,550</td>
</tr>
<tr>
<td valign="top" align="left">Crack</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">1,250</td>
</tr>
<tr>
<td valign="top" align="left">Missing components</td>
<td valign="top" align="center">320</td>
<td valign="top" align="center">550</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<label>4.1.4</label>
<title>Experimental hardware and software configuration</title>
<p>The experimental setup is based on a high-performance workstation featuring dual NVIDIA RTX 3090 GPUs, each equipped with 24GB of GDDR6X memory, ensuring efficient training of deep neural network models. The system is powered by an Intel Xeon Gold 6248R processor, running at 3.0 GHz, paired with 128GB of DDR4 system memory, enabling fast data preprocessing and augmentation. Detailed specifications of the experimental environment are provided in <xref ref-type="table" rid="T2">Table 2</xref>. For software implementation, we utilize the PyTorch deep learning framework (version 1.12.0) with CUDA 11.3 to leverage GPU acceleration, and cuDNN 8.2.1 for optimized neural network operations. All experiments are conducted on the Ubuntu 20.04 LTS operating system with Python 3.8.10. Additionally, OpenCV 4.5.5 is used for image processing tasks, and NumPy 1.21.5 is employed for numerical computations.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Experimental environment configuration.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Component</bold></th>
<th valign="top" align="left"><bold>Specification</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">CPU</td>
<td valign="top" align="left">Intel Xeon Gold 6248R &#x00040; 3.0GHz (20 cores)</td>
</tr>
<tr>
<td valign="top" align="left">GPU</td>
<td valign="top" align="left">NVIDIA RTX 3090 (24GB GDDR6X) &#x000D7; 2</td>
</tr>
<tr>
<td valign="top" align="left">System Memory</td>
<td valign="top" align="left">128GB DDR4-3200 ECC</td>
</tr>
<tr>
<td valign="top" align="left">Storage</td>
<td valign="top" align="left">2TB NVMe SSD (Read: 7000MB/s)</td>
</tr>
<tr>
<td valign="top" align="left">Operating System</td>
<td valign="top" align="left">Ubuntu 20.04.6 LTS (Kernel 5.15)</td>
</tr>
<tr>
<td valign="top" align="left">Deep Learning Framework</td>
<td valign="top" align="left">PyTorch 1.12.0</td>
</tr>
<tr>
<td valign="top" align="left">CUDA Version</td>
<td valign="top" align="left">11.3</td>
</tr>
<tr>
<td valign="top" align="left">cuDNN Version</td>
<td valign="top" align="left">8.2.1</td>
</tr>
<tr>
<td valign="top" align="left">Python Version</td>
<td valign="top" align="left">3.8.10</td>
</tr>
<tr>
<td valign="top" align="left">Additional libraries</td>
<td valign="top" align="left">OpenCV 4.5.5, NumPy 1.21.5</td>
</tr></tbody>
</table>
</table-wrap>
<p>The training procedure for the TE-YOLOv8 framework uses a stochastic gradient descent (SGD) optimizer with momentum (0.937) and weight decay (0.0005) for regularization. The initial learning rate is set to 0.01, adjusted through a cosine annealing schedule over 100 epochs, with early stopping based on validation performance to prevent overfitting. A batch size of 16 is used to balance memory utilization and gradient stability. Data augmentation techniques, including random scaling (0.5&#x02013;1.5), translation (&#x000B1;10%), rotation (&#x000B1;10&#x000B0;), color jittering, mosaic augmentation, and mixup, are applied to enhance model robustness to variations in distance, orientation, lighting, and object configurations. Model checkpoints are saved at regular intervals, with the best-performing model on the validation set selected for final evaluation. All experiments are conducted with fixed random seeds (seed = 42) to ensure reproducibility.</p></sec>
<sec>
<label>4.1.5</label>
<title>Model complexity analysis</title>
<p><xref ref-type="table" rid="T3">Table 3</xref> presents a comparison of computational metrics between TE-YOLOv8 and baseline models. TE-YOLOv8 contains 14.9 million trainable parameters, representing a 33.0% increase compared to the baseline YOLOv8s (11.2 M parameters). This growth in parameters is modest, particularly when considering the significant performance gains of 4.9&#x02013;5.1% in mean average precision (mAP), demonstrating that TE-YOLOv8 makes efficient use of its model capacity. When compared to YOLOv8m (which has 25.9 M parameters), TE-YOLOv8 achieves superior accuracy with 42.5% fewer parameters.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Computational complexity analysis.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Params (M)</bold></th>
<th valign="top" align="center"><bold>GFLOPs (640 &#x000D7; 640)</bold></th>
<th valign="top" align="center"><bold>FPS (RTX 3090)</bold></th>
<th valign="top" align="center"><bold>GPU Mem (MB)</bold></th>
<th valign="top" align="center"><bold>Latency (ms)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv8s</td>
<td valign="top" align="center">11.2</td>
<td valign="top" align="center">28.6</td>
<td valign="top" align="center">95</td>
<td valign="top" align="center">1,842</td>
<td valign="top" align="center">10.5</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8m</td>
<td valign="top" align="center">25.9</td>
<td valign="top" align="center">78.9</td>
<td valign="top" align="center">78</td>
<td valign="top" align="center">3,156</td>
<td valign="top" align="center">12.8</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8-IDX</td>
<td valign="top" align="center">13.7</td>
<td valign="top" align="center">35.2</td>
<td valign="top" align="center">88</td>
<td valign="top" align="center">2,184</td>
<td valign="top" align="center">11.4</td>
</tr>
<tr>
<td valign="top" align="left">TE-YOLOV8</td>
<td valign="top" align="center">14.3</td>
<td valign="top" align="center">38.1</td>
<td valign="top" align="center">85</td>
<td valign="top" align="center">2,298</td>
<td valign="top" align="center">11.8</td>
</tr>
<tr>
<td valign="top" align="left">TE-YOLOv8</td>
<td valign="top" align="center">14.9</td>
<td valign="top" align="center">42.3</td>
<td valign="top" align="center">82</td>
<td valign="top" align="center">2,456</td>
<td valign="top" align="center">12.2</td>
</tr>
<tr>
<td valign="top" align="left" colspan="6"><bold>Improvement vs. YOLOv8s:</bold></td>
</tr>
<tr>
<td valign="top" align="left">TE-YOLOv8</td>
<td valign="top" align="center">&#x0002B;33.0%</td>
<td valign="top" align="center">&#x0002B;47.9%</td>
<td valign="top" align="center">-13.7%</td>
<td valign="top" align="center">&#x0002B;33.3%</td>
<td valign="top" align="center">&#x0002B;16.2%</td>
</tr>
<tr>
<td valign="top" align="left">mAP gain</td>
<td valign="top" align="center">&#x0002B;4.9% (IDID)</td>
<td valign="top" align="center">&#x0002B;5.1% (CPLID)</td>
<td/>
<td/>
<td/>
</tr></tbody>
</table>
</table-wrap>
<p>In terms of computational load, TE-YOLOv8 requires 42.3 GFLOPs for processing a single 640 &#x000D7; 640 input image, compared to 28.6 GFLOPs for YOLOv8s, representing a 47.9% increase. This increase in computational cost is justified by the significant improvement in accuracy and remains much lower than YOLOv8m, which requires 78.9 GFLOPs. The efficient GConv decomposition and optimized transformer implementation contribute to minimizing computational costs while maximizing performance gains.</p></sec>
<sec>
<label>4.1.6</label>
<title>Evaluation metrics</title>
<p>Detection performance is quantified using standard object detection metrics, including Precision, Recall, F1-Score, and mean Average Precision (mAP). Precision measures the proportion of true positive detections among all positive predictions:</p>
<disp-formula id="EQ29"><mml:math id="M43"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(29)</label></disp-formula>
<p>Recall quantifies the proportion of ground truth objects successfully detected:</p>
<disp-formula id="EQ30"><mml:math id="M44"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(30)</label></disp-formula>
<p>The F1-Score provides a harmonic mean balancing precision and recall:</p>
<disp-formula id="EQ31"><mml:math id="M45"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(31)</label></disp-formula>
<p>Mean Average Precision aggregates detection performance across IoU thresholds and object classes:</p>
<disp-formula id="EQ32"><mml:math id="M46"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mi>A</mml:mi><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(32)</label></disp-formula>
<p>where K denotes the number of classes, and AP<sub>k</sub> represents the average precision for class k. Additionally, we evaluate computational efficiency through inference speed measured in frames per second, model parameter count, and floating-point operations to assess deployment feasibility for resource-constrained UAV platforms.</p>
<p>Furthermore, in <xref ref-type="table" rid="T4">Table 4</xref>, inference latency measurements show that TE-YOLOv8 processes images at 82 FPS on the NVIDIA RTX 3090 GPU, corresponding to 12.2 ms per frame. This real-time performance is suitable for UAV-based inspection applications, where typical flight speeds and image capture rates generate processing requirements in the range of 10&#x02013;30 FPS. While there is a 13.7% reduction in speed compared to the baseline YOLOv8s, this trade-off is acceptable given the substantial accuracy improvements. TE-YOLOv8 also exhibits good performance scalability at higher resolutions. At 896 &#x000D7; 896 resolution, which is ideal for detailed defect analysis, the model maintains 48 FPS with a 95.1% mAP. Even at the ultra-high resolution of 1,280 &#x000D7; 1,280, which is suitable for critical defect inspection, the model achieves 24 FPS (41.7 ms latency) with a 95.6% mAP. This resolution scalability provides flexibility, allowing the model to be deployed in various operational scenarios with different quality-speed trade-offs.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Inference performance of TE-YOLOv8 across input resolutions.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Resolution</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
<th valign="top" align="center"><bold>Latency (ms)</bold></th>
<th valign="top" align="center"><bold>GPU Mem (MB)</bold></th>
<th valign="top" align="center"><bold>mAP (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">416 &#x000D7; 416</td>
<td valign="top" align="center">142</td>
<td valign="top" align="center">7.0</td>
<td valign="top" align="center">1,524</td>
<td valign="top" align="center">91.8</td>
</tr>
<tr>
<td valign="top" align="left">640 &#x000D7; 640</td>
<td valign="top" align="center">82</td>
<td valign="top" align="center">12.2</td>
<td valign="top" align="center">2,456</td>
<td valign="top" align="center">94.2</td>
</tr>
<tr>
<td valign="top" align="left">896 &#x000D7; 896</td>
<td valign="top" align="center">48</td>
<td valign="top" align="center">20.8</td>
<td valign="top" align="center">4,128</td>
<td valign="top" align="center">95.1</td>
</tr>
<tr>
<td valign="top" align="left">1,280 &#x000D7; 1,280</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">41.7</td>
<td valign="top" align="center">7,842</td>
<td valign="top" align="center">95.6</td>
</tr></tbody>
</table>
</table-wrap>
<p>In terms of memory efficiency, GPU memory consumption is 2,456 MB for the standard 640 &#x000D7; 640 input, which is modest and allows for the batch processing of multiple images simultaneously. This memory efficiency ensures that TE-YOLOv8 can be deployed on edge computing platforms with limited GPU resources, such as the NVIDIA Jetson series, for onboard UAV processing.</p></sec>
<sec>
<label>4.1.7</label>
<title>Deployment feasibility analysis</title>
<p>To assess the feasibility of deploying TE-YOLOv8 on resource-constrained edge devices, we evaluated its performance after applying model optimization techniques. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, TE-YOLOv8 achieves 192 FPS (5.2 ms latency) on the NVIDIA RTX 3090 GPU with only a 0.2% drop in mAP following FP16 quantization and TensorRT optimization. With INT8 quantization-aware training (QAT), the model maintains 93.8% mAP while achieving 218 FPS. On the NVIDIA Jetson AGX Xavier edge platform, the INT8 QAT model runs at 52 FPS with 93.7% mAP and only 25W of power consumption. This demonstrates the excellent feasibility of TE-YOLOv8 for onboard UAV deployment, offering real-time performance with minimal energy usage, making it suitable for edge-based defect detection.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Model optimization for edge deployment.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Optimization</bold></th>
<th valign="top" align="center"><bold>Size</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
<th valign="top" align="center"><bold>mAP</bold></th>
<th valign="top" align="center"><bold>Platform</bold></th>
<th valign="top" align="center"><bold>Power</bold></th>
</tr>
</thead>
<tbody>
 <tr>
<td/>
<td valign="top" align="center"><bold>(MB)</bold></td>
<td valign="top" align="center"><bold>(416</bold>&#x000D7;<bold>416)</bold></td>
<td/>
<td/>
<td valign="top" align="center"><bold>(W)</bold></td>
</tr>
<tr>
<td valign="top" align="left">FP32 (Baseline)</td>
<td valign="top" align="center">59.6</td>
<td valign="top" align="center">82</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">RTX 3090</td>
<td valign="top" align="center">320</td>
</tr>
<tr>
<td valign="top" align="left">FP16</td>
<td valign="top" align="center">29.8</td>
<td valign="top" align="center">156</td>
<td valign="top" align="center">94.0</td>
<td valign="top" align="center">RTX 3090</td>
<td valign="top" align="center">285</td>
</tr>
<tr>
<td valign="top" align="left">INT8 (PTQ)</td>
<td valign="top" align="center">14.9</td>
<td valign="top" align="center">218</td>
<td valign="top" align="center">93.3</td>
<td valign="top" align="center">RTX 3090</td>
<td valign="top" align="center">245</td>
</tr>
<tr>
<td valign="top" align="left">INT8 (QAT)</td>
<td valign="top" align="center">14.9</td>
<td valign="top" align="center">218</td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">RTX 3090</td>
<td valign="top" align="center">245</td>
</tr>
<tr>
<td valign="top" align="left">ONNX-TensorRT</td>
<td valign="top" align="center">29.8</td>
<td valign="top" align="center">192</td>
<td valign="top" align="center">94.0</td>
<td valign="top" align="center">RTX 3090</td>
<td valign="top" align="center">270</td>
</tr>
<tr>
<td valign="top" align="left">FP16</td>
<td valign="top" align="center">29.8</td>
<td valign="top" align="center">38</td>
<td valign="top" align="center">94.0</td>
<td valign="top" align="center">Jetson AGX</td>
<td valign="top" align="center">30</td>
</tr>
<tr>
<td valign="top" align="left">INT8 (QAT)</td>
<td valign="top" align="center">14.9</td>
<td valign="top" align="center">52</td>
<td valign="top" align="center">93.7</td>
<td valign="top" align="center">Jetson AGX</td>
<td valign="top" align="center">25</td>
</tr></tbody>
</table>
</table-wrap></sec></sec>
<sec>
<label>4.2</label>
<title>Performance comparison across datasets</title>
<p>Extensive comparative experiments were conducted to evaluate TE-YOLOv8 against a comprehensive set of state-of-the-art object detection algorithms, including both two-stage and single-stage architectures. <xref ref-type="table" rid="T6">Tables 6</xref>, <xref ref-type="table" rid="T7">7</xref> present quantitative results across both datasets, showing the superiority of TE-YOLOv8 in multiple performance metrics.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Performance comparison with state-of-the-art detection algorithms on CPLID dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>mAP</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Faster R-CNN (<xref ref-type="bibr" rid="B30">Ren et al., 2015</xref>)</td>
<td valign="top" align="center">89.1</td>
<td valign="top" align="center">86.8</td>
<td valign="top" align="center">88.1</td>
<td valign="top" align="center">18</td>
</tr>
<tr>
<td valign="top" align="left">Cascade R-CNN (<xref ref-type="bibr" rid="B40">Wang et al., 2021</xref>)</td>
<td valign="top" align="center">91.2</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">12</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5s (<xref ref-type="bibr" rid="B44">Yang and Wang, 2023</xref>)</td>
<td valign="top" align="center">88.3</td>
<td valign="top" align="center">87.1</td>
<td valign="top" align="center">87.6</td>
<td valign="top" align="center">102</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7 (<xref ref-type="bibr" rid="B37">Wang et al., 2023</xref>)</td>
<td valign="top" align="center">89.9</td>
<td valign="top" align="center">88.4</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">86</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8s (baseline) (<xref ref-type="bibr" rid="B25">Lu et al., 2025</xref>)</td>
<td valign="top" align="center">89.6</td>
<td valign="top" align="center">87.9</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">95</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8m</td>
<td valign="top" align="center">91.2</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">90.4</td>
<td valign="top" align="center">78</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv10n (<xref ref-type="bibr" rid="B41">Wei and Wei, 2025</xref>)</td>
<td valign="top" align="center">90.1</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">89.6</td>
<td valign="top" align="center">88</td>
</tr>
<tr>
<td valign="top" align="left">EfficientDet-D3 (<xref ref-type="bibr" rid="B35">Tan et al., 2020</xref>)</td>
<td valign="top" align="center">89.5</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">35</td>
</tr>
<tr>
<td valign="top" align="left">DETR (<xref ref-type="bibr" rid="B42">Woo et al., 2018</xref>)</td>
<td valign="top" align="center">87.8</td>
<td valign="top" align="center">85.3</td>
<td valign="top" align="center">86.9</td>
<td valign="top" align="center">28</td>
</tr>
<tr>
<td valign="top" align="left">Deformable DETR (<xref ref-type="bibr" rid="B39">Wang et al., 2025</xref>)</td>
<td valign="top" align="center">90.1</td>
<td valign="top" align="center">87.8</td>
<td valign="top" align="center">89.2</td>
<td valign="top" align="center">24</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8-IDX (<xref ref-type="bibr" rid="B4">Farooq et al., 2025</xref>)</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">91.3</td>
<td valign="top" align="center">88</td>
</tr>
<tr>
<td valign="top" align="left">TE-YOLOv8 (ours)</td>
<td valign="top" align="center">94.7</td>
<td valign="top" align="center">92.9</td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">82</td>
</tr></tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Performance comparison with state-of-the-art detection algorithms on IDID dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>mAP</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Faster R-CNN (<xref ref-type="bibr" rid="B30">Ren et al., 2015</xref>)</td>
<td valign="top" align="center">89.7</td>
<td valign="top" align="center">87.2</td>
<td valign="top" align="center">88.6</td>
<td valign="top" align="center">18</td>
</tr>
<tr>
<td valign="top" align="left">Cascade R-CNN (<xref ref-type="bibr" rid="B40">Wang et al., 2021</xref>)</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">90.7</td>
<td valign="top" align="center">12</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5s (<xref ref-type="bibr" rid="B44">Yang and Wang, 2023</xref>)</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">87.5</td>
<td valign="top" align="center">88.1</td>
<td valign="top" align="center">102</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7 (<xref ref-type="bibr" rid="B37">Wang et al., 2023</xref>)</td>
<td valign="top" align="center">90.6</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">89.9</td>
<td valign="top" align="center">86</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8s (baseline) (<xref ref-type="bibr" rid="B25">Lu et al., 2025</xref>)</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">95</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8m</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">89.7</td>
<td valign="top" align="center">90.9</td>
<td valign="top" align="center">78</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv10n (<xref ref-type="bibr" rid="B41">Wei and Wei, 2025</xref>)</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">90.1</td>
<td valign="top" align="center">88</td>
</tr>
<tr>
<td valign="top" align="left">EfficientDet-D3 (<xref ref-type="bibr" rid="B35">Tan et al., 2020</xref>)</td>
<td valign="top" align="center">90.1</td>
<td valign="top" align="center">87.9</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">35</td>
</tr>
<tr>
<td valign="top" align="left">DETR (<xref ref-type="bibr" rid="B42">Woo et al., 2018</xref>)</td>
<td valign="top" align="center">88.4</td>
<td valign="top" align="center">85.8</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">28</td>
</tr>
<tr>
<td valign="top" align="left">Deformable DETR (<xref ref-type="bibr" rid="B39">Wang et al., 2025</xref>)</td>
<td valign="top" align="center">90.6</td>
<td valign="top" align="center">88.1</td>
<td valign="top" align="center">89.7</td>
<td valign="top" align="center">24</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8-IDX (<xref ref-type="bibr" rid="B4">Farooq et al., 2025</xref>)</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">91.2</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">85</td>
</tr>
<tr>
<td valign="top" align="left">TE-YOLOv8 (ours)</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">82</td>
</tr></tbody>
</table>
</table-wrap>
<p>Moreover, on the IDID dataset, TE-YOLOv8 outperformed all other detection algorithms, achieving the highest mean average precision (mAP) of 94.2%. This represents a significant improvement over baseline YOLOv8 (4.9%), YOLOv7 (4.3%), YOLOv8m (3.3%), YOLOv8-IDX (2.3%), and TE-YOLOv8 (1.9%). The model also demonstrated a precision of 95.3% and a recall of 93.1%, highlighting its strong balance between detection accuracy and completeness. These results validate the effectiveness of integrating transformer-based attention mechanisms and advanced feature fusion modules, significantly enhancing the ability of TE-YOLOv8 to detect insulator defects with improved precision and recall.</p>
<p>Similarly, on the CPLID dataset, TE-YOLOv8 achieved an mAP of 93.8%, outperforming YOLOv8 (5.1%), YOLOv7 (4.5%), YOLOv8m (3.4%), YOLOv8-IDX (2.5%), and TE-YOLOv8 (2%). The model&#x00027;s precision of 94.7% and recall of 92.9% further emphasize its effectiveness in detecting defects with minimal false positives while maintaining high detection completeness. The consistent improvements in both precision and recall across these datasets highlight the superior performance of TE-YOLOv8, underlining the importance of transformer-based attention mechanisms and feature fusion in improving overall detection performance. These advancements make TE-YOLOv8 a promising solution for real-time defect detection in complex environments.</p></sec>
<sec>
<label>4.3</label>
<title>Visual analysis of detection results</title>
<p><xref ref-type="fig" rid="F7">Figure 7</xref> illustrates successful detections on the IDID dataset, while <xref ref-type="fig" rid="F8">Figure 8</xref> shows successful detections on the CPLID dataset using TE-YOLOv8. Additionally, <xref ref-type="fig" rid="F9">Figures 9</xref>, <xref ref-type="fig" rid="F10">10</xref> track the progression of performance metrics such as training loss, validation loss, precision, recall, and mAP scores over the course of training. As the number of training epochs increases, both training and validation losses decrease, indicating that the model is improving. Concurrently, metrics like precision, recall, mAP&#x00040;0.5, and mAP&#x00040;0.5:0.95 increase, demonstrating enhanced detection capabilities. These results highlight the model&#x00027;s ability to effectively learn and improve over time.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Successful detections on the IDID dataset on TE-YOLOv8. Input images reproduced from (<xref ref-type="bibr" rid="B14">Lewis and Kulkarni, 2021</xref>), licensed under <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</ext-link>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0007.tif">
<alt-text>Grid of images showing utility pole insulators with labeled detections. Labels include &#x0201C;insulator,&#x0201D; &#x0201C;dust-flashover,&#x0201D; and &#x0201C;broken,&#x0201D; each with confidence scores. Various insulator states and environments are depicted.</alt-text>
</graphic>
</fig>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Successful detections on the CPLID dataset on TE-YOLOv8. Input images reproduced from <xref ref-type="bibr" rid="B24">Liu et al. (2024)</xref>, licensed under <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</ext-link>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0008.tif">
<alt-text>Eight line graphs show training and validation losses, and metrics over epochs. The first row includes train/box_loss, train/cls_loss, train/dfl_loss, metrics/precision(B), and metrics/recall(B). The second row shows val/box_loss, val/cls_loss, val/dfl_loss, metrics/mAP50(B), and metrics/mAP50-95(B). Loss graphs decline, while precision and recall metrics increase.</alt-text>
</graphic>
</fig>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Precision, recall, and mAP as TE-YOLOv8 training progresses over the CPLID dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0009.tif">
<alt-text>Confusion matrix showing predicted versus true labels for three categories: insulator, defect, and background. Values include 233 true positives for insulator, 61 false positives, 36 predicted as defects, and minimal background misclassifications. Color intensity indicates frequency, with darker blue representing higher numbers.</alt-text>
</graphic>
</fig>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>Precision, recall, and mAP as TE-YOLOv8 training progresses over the IDID dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0010.tif">
<alt-text>Confusion matrix showing predicted versus true classifications with four categories: dust-flashover, broken, insulator, and background. Notable entries include 189 correct predictions for dust-flashover, 88 for broken, 164 for insulator, and 41 misclassifications for dust-flashover as background. A color scale represents frequency, from light blue (low) to dark blue (high).</alt-text>
</graphic>
</fig>
<p>Moreover, <xref ref-type="fig" rid="F11">Figures 11</xref>, <xref ref-type="fig" rid="F12">12</xref> present confusion matrices comparing the performance of TE-YOLOv8 against traditional detection models. The results clearly demonstrate that our method outperforms conventional models, underscoring its potential for further advancements in defect detection research.</p>
<fig position="float" id="F11">
<label>Figure 11</label>
<caption><p>Confusion matrix on the CPLID dataset on TE-YOLOv8.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0011.tif">
<alt-text>Six drone-captured images show power line insulators marked with blue boxes, indicating defects. Annotations display an insulator confidence value, including defect details with varying scores, atop different landscapes, such as roads, fields, and construction sites.</alt-text>
</graphic>
</fig>
<fig position="float" id="F12">
<label>Figure 12</label>
<caption><p>Confusion matrix on the IDID dataset on TE-YOLOv8.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0012.tif">
<alt-text>Graphs showing model training and validation metrics over epochs. The top row displays &#x0201C;train/box_loss&#x0201D;, &#x0201C;train/cls_loss&#x0201D;, &#x0201C;train/dfl_loss&#x0201D;, &#x0201C;metrics/precision(B)&#x0201D;, and &#x0201C;metrics/recall(B)&#x0201D;. The bottom row shows &#x0201C;val/box_loss&#x0201D;, &#x0201C;val/cls_loss&#x0201D;, &#x0201C;val/dfl_loss&#x0201D;, &#x0201C;metrics/mAP50(B)&#x0201D;, and &#x0201C;metrics/mAP50-95(B)&#x0201D;. Each graph includes a blue &#x0201C;results&#x0201D; line and an orange &#x0201C;smooth&#x0201D; line. The training and validation losses decrease, while precision, recall, and mAP metrics generally improve over time.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>4.3</label>
<title>Discussion of failure cases of TE-YOLOv8</title>
<p><xref ref-type="fig" rid="F13">Figures 13</xref>, <xref ref-type="fig" rid="F14">14</xref> present representative failure cases of TE-YOLOv8 evaluated on the IDID and CPLID datasets, respectively. As shown in <xref ref-type="fig" rid="F13">Figure 13</xref> (IDID), the model occasionally produced false negatives, where insulator defects were missed under severe occlusion or when cracks appeared faint, low-contrast, or partially blended into the surface. False positives were also observed, particularly when background structures such as clamps, stains, or shadows resembled defect patterns and were incorrectly classified as faults. Similarly, <xref ref-type="fig" rid="F14">Figure 14</xref> (CPLID) illustrates failure cases in which defects were overlooked due to extreme scale variation (false negatives), as well as instances where normal insulators were incorrectly flagged as defective. These false positives often arose from background clutter, unusual lighting conditions, or reflections that created visual artifacts mimicking real damage. In both datasets, mislocalized bounding boxes were also observed, especially for elongated insulators or small defect regions.</p>
<fig position="float" id="F13">
<label>Figure 13</label>
<caption><p>Representative failure cases on the IDID dataset. Input images reproduced from (<xref ref-type="bibr" rid="B14">Lewis and Kulkarni, 2021</xref>), licensed under <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">CC BY 4.0</ext-link>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0013.tif">
<alt-text>Composite image showing examples of false negatives and false positives in detecting insulators on power lines. The top row illustrates false negatives with undetected insulators, while the bottom row shows false positives with incorrect detections labeled with confidence scores. The labels include clusters such as &#x0201C;dust/flashover&#x0201D; and numerical values indicating detection confidence. The background varies between grassy and barren landscapes.</alt-text>
</graphic>
</fig>
<fig position="float" id="F14">
<label>Figure 14</label>
<caption><p>Representative failure cases on the CPLID dataset. Input images reproduced from <xref ref-type="bibr" rid="B24">Liu et al. (2024</xref>), licensed under <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">CC BY 4.0</ext-link>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1732616-g0014.tif">
<alt-text>Composite image showing electrical insulators on transmission towers. The top row labeled &#x0201C;False negatives&#x0201D; includes marked defects with confidence levels 0.82 and 0.67. The bottom row labeled &#x0201C;False positives&#x0201D; identifies insulators with various confidence scores, including 0.72, 0.93, and 0.91. Each insulator and defect is highlighted in blue boxes with corresponding labels. The surrounding environment includes fields and infrastructure.</alt-text>
</graphic>
</fig>
<p>These examples highlight the remaining challenges for practical deployment of TE-YOLOv8 in real-world inspection scenarios. They emphasize the need for further refinement of the detection framework, particularly in handling low-contrast defects, occlusion, and background interference. Future work will focus on domain adaptation to improve generalization across diverse environments and semi-supervised learning strategies to enhance robustness when annotated data are limited.</p></sec>
<sec>
<label>4.4</label>
<title>Ablation studies</title>
<p>Comprehensive ablation experiments were conducted to quantify the individual and cumulative contributions of the proposed modules in TE-YOLOv8. We progressively integrated the Global Convolution (GConv), C3f-Global Pooling Fusion (C3f-GPF), Multiscale Information Fusion (MSIF), Weighted Feature Information Fusion (WFIF), and the transformer-enhanced neck, starting from the YOLOv8 baseline, and measured changes in precision, recall, F1-score, mAP, parameters, and FPS. The results on IDID and CPLID (<xref ref-type="table" rid="T8">Tables 8</xref>, <xref ref-type="table" rid="T9">9</xref>) demonstrate consistent, stepwise gains in detection accuracy with modest computational overhead, confirming that each component contributes meaningfully to performance.</p>
<sec>
<label>4.4.1</label>
<title>Ablation results on IDID dataset</title>
<p>In <xref ref-type="table" rid="T8">Table 8</xref>, on the IDID dataset, the baseline YOLOv8 achieved an mAP of 88.7% with 95 FPS and 11.2M parameters. Incorporating the GConv module improved mAP to 90.1% while maintaining high efficiency, demonstrating the benefit of large-kernel decomposed convolutions for capturing extended spatial context. Adding the C3f-GPF module further increased mAP to 91.3%, highlighting the role of global pooling-based recalibration in enhancing discriminative feature extraction. The integration of MSIF yielded 92.6% mAP, confirming the effectiveness of adaptive multi-scale fusion for handling extreme scale variation. With WFIF, performance rose to 93.2% mAP, showing the value of channel-wise attention in suppressing background interference. Finally, the transformer-enhanced neck produced the full TE-YOLOv8 configuration, achieving 94.2% mAP with precision of 95.3% and recall of 93.1%, while maintaining real-time inference at 82 FPS with 14.9M parameters.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Ablation study results on IDID dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>GConv</bold></th>
<th valign="top" align="center"><bold>C3f-GPF</bold></th>
<th valign="top" align="center"><bold>MSIF</bold></th>
<th valign="top" align="center"><bold>WFIF</bold></th>
<th valign="top" align="center"><bold>TE-YOLOv8</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>mAP</bold></th>
<th valign="top" align="center"><bold>Params</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv8</td>
<td/>
<td/>
<td/>
<td/>
<td/>
<td valign="top" align="center">89.1</td>
<td valign="top" align="center">89.7</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">11.2M</td>
<td valign="top" align="center">95</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td/>
<td/>
<td valign="top" align="center">91.3</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">90.7</td>
<td valign="top" align="center">90.1</td>
<td valign="top" align="center">11.7M</td>
<td valign="top" align="center">92</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td/>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">91.3</td>
<td valign="top" align="center">12.5M</td>
<td valign="top" align="center">89</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td valign="top" align="center">93.3</td>
<td valign="top" align="center">92.9</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">13.2M</td>
<td valign="top" align="center">86</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">92.7</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">93.2</td>
<td valign="top" align="center">13.8M</td>
<td valign="top" align="center">84</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center"><bold>95.3</bold></td>
<td valign="top" align="center"><bold>93.1</bold></td>
<td valign="top" align="center"><bold>93.8</bold></td>
<td valign="top" align="center"><bold>94.2</bold></td>
<td valign="top" align="center"><bold>14.9M</bold></td>
<td valign="top" align="center"><bold>82</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best results.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>4.4.2</label>
<title>Ablation results on CPLID dataset</title>
<p>In <xref ref-type="table" rid="T9">Table 9</xref>, on the CPLID dataset, similar trends were observed. The baseline YOLOv8 achieved 89.3% mAP with 95 FPS and 11.2M parameters. The addition of GConv improved mAP to 90.8%, reflecting its robustness under challenging atmospheric conditions. Incorporating C3f-GPF raised mAP to 91.9%, particularly benefiting detection under hazy and foggy scenarios where global context is critical. The MSIF module further enhanced mAP to 93.1%, validating its ability to integrate features across diverse scales. WFIF contributed an additional gain, bringing mAP to 93.8% with balanced precision and recall, while the transformer neck consolidated these improvements, yielding a final TE-YOLOv8 performance of 93.8% mAP, precision of 94.7%, and F1-score of 94.1% at 82 FPS with 14.9M parameters.</p>
<table-wrap position="float" id="T9">
<label>Table 9</label>
<caption><p>Ablation study results on CPLID dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>GConv</bold></th>
<th valign="top" align="center"><bold>C3f-GPF</bold></th>
<th valign="top" align="center"><bold>MSIF</bold></th>
<th valign="top" align="center"><bold>WFIF</bold></th>
<th valign="top" align="center"><bold>TE-YOLOv8</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>mAP</bold></th>
<th valign="top" align="center"><bold>Params</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv8</td>
<td/>
<td/>
<td/>
<td/>
<td/>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">89.1</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">11.2M</td>
<td valign="top" align="center">95</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td/>
<td/>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">91.7</td>
<td valign="top" align="center">92.2</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">11.7M</td>
<td valign="top" align="center">92</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td/>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">12.5M</td>
<td valign="top" align="center">89</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td valign="top" align="center">93.4</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">13.2M</td>
<td valign="top" align="center">86</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">92.7</td>
<td valign="top" align="center">93.4</td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">13.8M</td>
<td valign="top" align="center">84</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center"><bold>94.7</bold></td>
<td valign="top" align="center"><bold>92.9</bold></td>
<td valign="top" align="center"><bold>94.1</bold></td>
<td valign="top" align="center"><bold>93.8</bold></td>
<td valign="top" align="center"><bold>14.9 M</bold></td>
<td valign="top" align="center"><bold>82</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best results.</p>
</table-wrap-foot>
</table-wrap>
<p>Overall, the ablation studies confirm that each proposed module contributes measurable improvements in accuracy while maintaining real-time efficiency. The cumulative gains of &#x0002B;5.5 mAP points on IDID and &#x0002B;4.5 mAP points on CPLID demonstrate the robustness and generalization ability of TE-YOLOv8. Moreover, the varying contribution ratios across datasets reflect their unique challenges: CPLID benefits more from global context modeling under atmospheric degradation, whereas IDID gains more from extended receptive fields and channel-wise prioritization in cluttered backgrounds.</p></sec></sec></sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This research introduces TE-YOLOv8, a novel deep learning framework for automated insulator defect detection in high-voltage transmission systems. By combining transformer-based attention mechanisms with advanced convolutional modules, TE-YOLOv8 achieves superior detection performance while maintaining real-time processing capabilities for UAV-based inspection applications. Key innovations include Global Convolution modules for extended spatial context, C3f-Global Pooling Fusion modules for feature amplification, Multiscale Information Fusion modules for adaptive multi-scale detection, Weighted Feature Information Fusion modules for dynamic channel-wise attention, and a transformer-enhanced neck architecture for global dependency modeling. These modules address critical challenges such as small defect detection, complex backgrounds, scale variations, and adverse imaging conditions. Comprehensive validation on the IDID and CPLID datasets shows TE-YOLOv8 achieves mAPs of 94.2% and 93.8%, respectively, an improvement of 5.5 and 4.5 percentage points over baseline YOLOv8, while maintaining real-time inference at 82 FPS. Ablation studies and comparative analyses confirm the model&#x00027;s superiority, establishing a robust foundation for automated smart grid maintenance. Future work could explore multi-modal sensor fusion, detailed defect severity assessment, and active vision strategies to further enhance defect detection capabilities.</p>
<sec>
<label>5.1</label>
<title>Limitations and future directions</title>
<p>Despite its promising performance, several limitations must be addressed. First, the current framework is focused exclusively on visible defects in optical imagery. Incorporating multi-modal fusion, such as infrared thermography or ultraviolet corona imaging, could help detect internal degradation and electrical tracking. Second, current datasets employ coarse classification, and finer taxonomies could enable more detailed severity assessments and failure mechanism identification. Third, few-shot learning techniques or synthetic data augmentation could address the challenge of limited training examples for rare defect categories. Finally, integrating the framework with active vision systems for adaptive image capture upon defect detection could improve inspection efficiency and reliability.</p></sec></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/supplementary material.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>UF: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Software, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. FY: Formal analysis, Funding acquisition, Resources, Supervision, Validation, Writing &#x02013; review &#x00026; editing. JS: Formal analysis, Investigation, Supervision, Validation, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. The author(s) declare that they utilized ChatGPT-3.5 during the preparation of this work to assist in refining the writing style and language. Following the use of this AI tool, the author(s) thoroughly reviewed, edited, and validated the content to ensure its accuracy and alignment with the intended academic and scientific standards. The author(s) take full responsibility for the content and conclusions presented in this publication.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bochkovskiy</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>C.-Y.</given-names></name> <name><surname>Liao</surname> <given-names>H.-Y. M.</given-names></name></person-group> (<year>2020</year>). <article-title>YOLOv4: optimal speed and accuracy of object detection</article-title>. <source>arXiv:2004.10934</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2004.10934</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>L.-C.</given-names></name> <name><surname>Papandreou</surname> <given-names>G.</given-names></name> <name><surname>Kokkinos</surname> <given-names>I.</given-names></name> <name><surname>Murphy</surname> <given-names>K.</given-names></name> <name><surname>Yuille</surname> <given-names>A. L.</given-names></name></person-group> (<year>2017</year>). <article-title>DeepLab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>40</volume>, <fpage>834</fpage>&#x02013;<lpage>848</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2017.2699184</pub-id><pub-id pub-id-type="pmid">28463186</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Elfwing</surname> <given-names>S.</given-names></name> <name><surname>Uchibe</surname> <given-names>E.</given-names></name> <name><surname>Doya</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <article-title>Sigmoid-weighted linear units for neural network function approximation in reinforcement learning</article-title>. <source>Neural Netw</source>. <volume>107</volume>, <fpage>3</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2017.12.012</pub-id><pub-id pub-id-type="pmid">29395652</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Farooq</surname> <given-names>U.</given-names></name> <name><surname>Yang</surname> <given-names>F.</given-names></name> <name><surname>Shahzadi</surname> <given-names>M.</given-names></name> <name><surname>Ali</surname> <given-names>U.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name></person-group> (<year>2025</year>). <article-title>YOLOv8-IDX: optimized deep learning model for transmission line insulator-defect detection</article-title>. <source>Electronics</source> <volume>14</volume>:<fpage>1828</fpage>. doi: <pub-id pub-id-type="doi">10.3390/electronics14091828</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gevorgyan</surname> <given-names>Z.</given-names></name></person-group> (<year>2022</year>). <article-title>SIoU loss: more powerful learning for bounding box regression</article-title>. <source>arXiv:2205.12740</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2205.12740</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Fast R-CNN,&#x0201D;</article-title> in <source>2015 IEEE International Conference on Computer Vision (ICCV)</source> (<publisher-loc>Santiago</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1440</fpage>&#x02013;<lpage>1448</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV.2015.169</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Donahue</surname> <given-names>J.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name> <name><surname>Malik</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Rich feature hierarchies for accurate object detection and semantic segmentation,&#x0201D;</article-title> in <source>2014 IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Columbus, OH</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>580</fpage>&#x02013;<lpage>587</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2014.81</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hao</surname> <given-names>K.</given-names></name> <name><surname>Chen</surname> <given-names>G.</given-names></name> <name><surname>Zhao</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name></person-group> (<year>2022</year>). <article-title>An insulator defect detection model in aerial images based on multiscale feature pyramid network</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>71</volume>, <fpage>1</fpage>&#x02013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2022.3200861</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Deep residual learning for image recognition,&#x0201D;</article-title> in <source>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>770</fpage>&#x02013;<lpage>778</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name></person-group> (<year>2025</year>). <article-title>DRR-YOLO: a study of small target multi-modal defect detection for multiple types of insulators based on large convolution kernel</article-title>. <source>IEEE Access</source> <volume>13</volume>:<fpage>3539831</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3539831</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Z.</given-names></name> <name><surname>Zhai</surname> <given-names>B.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Zhai</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Yang</surname> <given-names>K.</given-names></name></person-group> (<year>2025</year>). <article-title>State-space-model-guided deep feature perception network for insulator defect detection in high-resolution aerial images</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>63</volume>:<fpage>3584663</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TGRS.2025.3584663</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>He</surname> <given-names>Z.</given-names></name></person-group> (<year>2023</year>). <article-title>Enhanced detection of subway insulator defects based on improved YOLOv8</article-title>. <source>Appl. Sci.</source> <volume>13</volume>:<fpage>13044</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app132413044</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>S.</given-names></name> <name><surname>Dong</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>L.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Detection of insulator burst position of lightweight YOLOv8,&#x0201D;</article-title> in <source>ICCAI &#x00027;22: Proceedings of the 8th International Conference on Computing and Artificial Intelligence</source> (<publisher-loc>Tianjin</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>573</fpage>&#x02013;<lpage>578</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3532213.3532300</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lewis</surname> <given-names>D.</given-names></name> <name><surname>Kulkarni</surname> <given-names>P.</given-names></name></person-group> (<year>2021</year>). <source>Insulator Defect Detection Dataset (IDID)</source>. IEEE Dataport.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Shi</surname> <given-names>Y.</given-names></name> <name><surname>Lu</surname> <given-names>M.</given-names></name> <name><surname>Zhou</surname> <given-names>S.</given-names></name> <name><surname>Xie</surname> <given-names>C.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>A composite insulator overheating defect detection system based on infrared image object detection</article-title>. <source>IEEE Trans. Power Del.</source> <volume>40</volume>:<fpage>3488061</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TPWRD.2024.3488061</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Weng</surname> <given-names>K.</given-names></name> <name><surname>Geng</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>YOLOv6: a single-stage object detection framework for industrial applications</article-title>. <source>arXiv:2209.02976</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2209.02976</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>She</surname> <given-names>Q.</given-names></name> <name><surname>Zhu</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Involution: inverting the inherence of convolution for visual recognition,&#x0201D;</article-title> in <source>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>12316</fpage>&#x02013;<lpage>12325</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01214</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>H.</given-names></name> <name><surname>Lv</surname> <given-names>G.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name></person-group> (<year>2025</year>). <article-title>A2MADA-YOLO: attention alignment multiscale adversarial domain adaptation YOLO for insulator defect detection in generalized foggy scenario</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3541814</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2025.3541814</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>T.-Y.</given-names></name> <name><surname>Doll&#x000E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Hariharan</surname> <given-names>B.</given-names></name> <name><surname>Belongie</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Feature pyramid networks for object detection,&#x0201D;</article-title> in <source>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Honolulu, HI</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>936</fpage>&#x02013;<lpage>944</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2017.106</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>C.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Han</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>MTI-YOLO: a light-weight and real-time deep neural network for insulator detection in complex aerial images</article-title>. <source>Energies</source> <volume>14</volume>:<fpage>1426</fpage>. doi: <pub-id pub-id-type="doi">10.3390/en14051426</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Ou</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Jiao</surname> <given-names>R.</given-names></name> <name><surname>Wu</surname> <given-names>T.</given-names></name></person-group> (<year>2025</year>). <article-title>Lightweight method for insulator defect detection based on improved convolutional neural networks</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3599701</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2025.3599701</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Yan</surname> <given-names>Y.</given-names></name> <name><surname>Jiang</surname> <given-names>Q.</given-names></name> <name><surname>Jiang</surname> <given-names>X.</given-names></name></person-group> (<year>2025</year>). <article-title>Addressing domain shift in insulator defect data: a generalization framework for cross-domain detection of broken and self-blast insulator defect</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3580815</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2025.3580815</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Anguelov</surname> <given-names>D.</given-names></name> <name><surname>Erhan</surname> <given-names>D.</given-names></name> <name><surname>Szegedy</surname> <given-names>C.</given-names></name> <name><surname>Reed</surname> <given-names>S.</given-names></name> <name><surname>Fu</surname> <given-names>C.-Y.</given-names></name> <name><surname>Berg</surname> <given-names>A. C.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;SSD: single shot MultiBox detector,&#x0201D;</article-title> in <source>Proc. Eur. Conf. Comput. Vis. (ECCV)</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>21</fpage>&#x02013;<lpage>37</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-46448-0_2</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="data"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). <source>Chinese Power Line Insulator Dataset (CPLID)</source>. <publisher-name>IEEE DataPort</publisher-name>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>G.</given-names></name> <name><surname>Li</surname> <given-names>B.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Qu</surname> <given-names>S.</given-names></name> <name><surname>Cheng</surname> <given-names>T.</given-names></name> <name><surname>Zhou</surname> <given-names>J.</given-names></name></person-group> (<year>2025</year>). <article-title>Precision in aerial surveillance: integrating YOLOv8 with PConv and CoT for accurate insulator defect detection</article-title>. <source>IEEE Access</source> <volume>13</volume>:<fpage>3551289</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3551289</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>B.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Chen</surname> <given-names>B.</given-names></name></person-group> (<year>2025</year>). <article-title>A small-sized defect detection method for power line insulator using multiscale feature and lightweight networks in UAV-vision</article-title>. <source>IEEE Trans. Power Del.</source> <volume>40</volume>:<fpage>3589542</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TPWRD.2025.3589542</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ou</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Xue</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>Infrared image target detection of substation electrical equipment using an improved faster R-CNN</article-title>. <source>IEEE Trans. Power Del.</source> <volume>38</volume>, <fpage>387</fpage>&#x02013;<lpage>396</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPWRD.2022.3191694</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Redmon</surname> <given-names>J.</given-names></name> <name><surname>Divvala</surname> <given-names>S.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Farhadi</surname> <given-names>A.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;You only look once: unified, real-time object detection,&#x0201D;</article-title> in <source>IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>779</fpage>&#x02013;<lpage>788</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Redmon</surname> <given-names>J.</given-names></name> <name><surname>Farhadi</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>YOLOv3: an incremental improvement</article-title>. <source>arXiv:1804.02767</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1804.02767</pub-id></mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Faster R-CNN: towards real-time object detection with region proposal networks</article-title>. <source>Proc. Adv. Neural Inf. Process. Syst.</source> <volume>28</volume>, <fpage>1</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id><pub-id pub-id-type="pmid">27295650</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shaikh</surname> <given-names>J. A.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Saifullah</surname> <given-names>S.</given-names></name> <name><surname>Ima</surname> <given-names>M. W. U.</given-names></name> <name><surname>Arshad</surname> <given-names>M.</given-names></name> <name><surname>Rathore</surname> <given-names>W. U. A.</given-names></name></person-group> (<year>2025</year>). <article-title>Memory feedback transformer based intrusion detection system for IoMT healthcare networks</article-title>. <source>Internet Things</source> <volume>32</volume>:<fpage>101597</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.iot.2025.101597</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>P.</given-names></name> <name><surname>Mei</surname> <given-names>K.</given-names></name> <name><surname>Cao</surname> <given-names>H.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name></person-group> (<year>2025</year>). <article-title>LDDFSF-YOLO11: a lightweight insulator defect detection method focusing on small-sized features</article-title>. <source>IEEE Access</source> <volume>13</volume>:<fpage>3569970</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3569970</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shuang</surname> <given-names>F.</given-names></name> <name><surname>Wei</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Gu</surname> <given-names>X.</given-names></name> <name><surname>Lu</surname> <given-names>Z.</given-names></name></person-group> (<year>2023</year>). <article-title>Detail R-CNN: insulator detection based on detail feature enhancement and metric learning</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>72</volume>, <fpage>1</fpage>&#x02013;<lpage>14</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2023.3305667</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Souza</surname> <given-names>B. J.</given-names></name> <name><surname>Stefenon</surname> <given-names>S. F.</given-names></name> <name><surname>Singh</surname> <given-names>G.</given-names></name> <name><surname>Freire</surname> <given-names>R. Z.</given-names></name></person-group> (<year>2023</year>). <article-title>Hybrid-YOLO for classification of insulators defects in transmission lines based on UAV</article-title>. <source>Int. J. Electr. Power Energy Syst.</source> <volume>148</volume>:<fpage>108982</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijepes.2023.108982</pub-id></mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tan</surname> <given-names>M.</given-names></name> <name><surname>Pang</surname> <given-names>R.</given-names></name> <name><surname>Le</surname> <given-names>Q. V.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;EfficientDet: scalable and efficient object detection,&#x0201D;</article-title> in <source>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10778</fpage>&#x02013;<lpage>10787</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.01079</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>arXiv:1706.03762</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>C.-Y.</given-names></name> <name><surname>Bochkovskiy</surname> <given-names>A.</given-names></name> <name><surname>Liao</surname> <given-names>H.-Y. M.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;YOLOv7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors,&#x0201D;</article-title> in <source>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>7464</fpage>&#x02013;<lpage>7475</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00721</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>C.-Y.</given-names></name> <name><surname>Liao</surname> <given-names>H.-Y. M.</given-names></name> <name><surname>Wu</surname> <given-names>Y.-H.</given-names></name> <name><surname>Chen</surname> <given-names>P.-Y.</given-names></name> <name><surname>Hsieh</surname> <given-names>J.-W.</given-names></name> <name><surname>Yeh</surname> <given-names>I.-H.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;CSPNet: a new backbone that can enhance learning capability of CNN,&#x0201D;</article-title> in <source>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>390</fpage>&#x02013;<lpage>391</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPRW50498.2020.00203</pub-id></mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Qu</surname> <given-names>Z.</given-names></name> <name><surname>Hu</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>C.</given-names></name> <name><surname>Huang</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Cross-domain multilevel feature adaptive alignment R-CNN for insulator defect detection in transmission lines</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3527619</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2025.3527619</pub-id></mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Peng</surname> <given-names>H.</given-names></name> <name><surname>Zheng</surname> <given-names>L.</given-names></name> <name><surname>Gao</surname> <given-names>J.</given-names></name> <name><surname>Bao</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>Railway insulator detection based on adaptive cascaded convolutional neural network</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>115676</fpage>&#x02013;<lpage>115686</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3105419</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>Z.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>YOLOv10n-based defect detection in power insulators: attention enhancement and feature fusion optimization</article-title>. <source>IEEE Access</source> <volume>13</volume>:<fpage>3581672</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3581672</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Woo</surname> <given-names>S.</given-names></name> <name><surname>Park</surname> <given-names>J.</given-names></name> <name><surname>Lee</surname> <given-names>J.</given-names></name> <name><surname>Kweon</surname> <given-names>I. S.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;CBAM: convolutional block attention module,&#x0201D;</article-title> in <source>Proc. Eur. Conf. Comput. Vis. (ECCV)</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>3</fpage>&#x02013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>J.</given-names></name> <name><surname>Liao</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Jiang</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name></person-group> (<year>2025</year>). <article-title>Multiscale feature fusion transformer with hybrid attention for insulator defect detection</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3568984</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2025.3568984</pub-id></mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name></person-group> (<year>2023</year>). <article-title>Insulator detection using small samples based on YOLOv8 in natural background</article-title>. <source>Multimedia Tools Appl.</source> <volume>82</volume>, <fpage>44841</fpage>&#x02013;<lpage>44857</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11042-023-15722-1</pub-id></mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Z.</given-names></name> <name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Bidirection-fusion-YOLOv3: an improved method for insulator defect detection using UAV image</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>71</volume>, <fpage>1</fpage>&#x02013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2022.3201499</pub-id></mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>F.</given-names></name> <name><surname>Koltun</surname> <given-names>V.</given-names></name></person-group> (<year>2015</year>). <article-title>Multi-scale context aggregation by dilated convolutions</article-title>. <source>arXiv:1511.07122</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1511.07122</pub-id></mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>K.</given-names></name> <name><surname>Zhao</surname> <given-names>L.</given-names></name> <name><surname>Xue</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name></person-group> (<year>2025</year>). <article-title>SMA-YOLO: a defect detection algorithm for self-explosion of insulators under complex backgrounds</article-title>. <source>IEEE Access</source> <volume>13</volume>:<fpage>3609906</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3609906</pub-id></mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Lei</surname> <given-names>Y.</given-names></name> <name><surname>Shen</surname> <given-names>F.</given-names></name> <name><surname>Zhou</surname> <given-names>S.</given-names></name> <name><surname>Yuan</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>Research on identification and detection of transmission line insulator defects based on a lightweight YOLOv8 network</article-title>. <source>Remote Sens.</source> <volume>15</volume>:<fpage>4552</fpage>. doi: <pub-id pub-id-type="doi">10.3390/rs15184552</pub-id></mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yuan</surname> <given-names>P.</given-names></name> <name><surname>Pu</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>Improving electricity supply reliability in China: cost and incentive regulation</article-title>. <source>Energy</source> <volume>237</volume>:<fpage>121558</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.energy.2021.121558</pub-id></mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Zhu</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name></person-group> (<year>2025</year>). <article-title>TE-YOLOV8: a multimodule optimized algorithm for insulator defect detection in power transmission lines</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3527530</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2025.3527530</pub-id></mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>T.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Xin</surname> <given-names>M.</given-names></name> <name><surname>Liao</surname> <given-names>J.</given-names></name> <name><surname>Xie</surname> <given-names>Q.</given-names></name></person-group> (<year>2023</year>). <article-title>A light-weight network for small insulator and defect detection using UAV imaging based on improved YOLOv8</article-title>. <source>Sensors</source> <volume>23</volume>:<fpage>5249</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s23115249</pub-id><pub-id pub-id-type="pmid">37299976</pub-id></mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>B.</given-names></name> <name><surname>Yang</surname> <given-names>Q.</given-names></name> <name><surname>Tang</surname> <given-names>F.</given-names></name> <name><surname>Wei</surname> <given-names>K.</given-names></name></person-group> (<year>2025</year>). <article-title>A two-stage insulator defect detection network with sequence transduction</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>74</volume>:<fpage>3522390</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TIM.2024.3522390</pub-id></mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>ARG-mask RCNN: an infrared insulator fault-detection network based on improved mask RCNN</article-title>. <source>Sensors</source> <volume>22</volume>:<fpage>4720</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s22134720</pub-id><pub-id pub-id-type="pmid">35808217</pub-id></mixed-citation>
</ref>
</ref-list>
<app-group>
<app id="A1">
<title>Appendix</title>
<table-wrap position="float" id="TA1">
<label>Table A1</label>
<caption><p>Appendix A: nomenclature.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="center"><bold>Symbol</bold></th>
<th valign="top" align="center"><bold>Description</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>F</bold></td>
<td valign="top" align="center">Feature map</td>
</tr>
<tr>
<td valign="top" align="left"><italic>C</italic></td>
<td valign="top" align="center">Number of feature channels</td>
</tr>
<tr>
<td valign="top" align="left"><italic>H</italic></td>
<td valign="top" align="center">Feature map height</td>
</tr>
<tr>
<td valign="top" align="left"><italic>W</italic></td>
<td valign="top" align="center">Feature map width</td>
</tr>
<tr>
<td valign="top" align="left"><italic>k</italic></td>
<td valign="top" align="center">Convolution kernel size</td>
</tr>
<tr>
<td valign="top" align="left"><bold>W</bold></td>
<td valign="top" align="center">Weight matrix</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C3;</td>
<td valign="top" align="center">Activation function</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002A;</td>
<td valign="top" align="center">Convolution operation</td>
</tr>
<tr>
<td valign="top" align="left">&#x02299;</td>
<td valign="top" align="center">Element-wise multiplication</td>
</tr>
<tr>
<td valign="top" align="left">&#x02113;</td>
<td valign="top" align="center">Feature pyramid level</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Q</bold>, <bold>K</bold>, <bold>V</bold></td>
<td valign="top" align="center">Query, key, value matrices</td>
</tr>
<tr>
<td valign="top" align="left"><italic>d</italic><sub><italic>k</italic></sub></td>
<td valign="top" align="center">Key dimension in attention</td>
</tr>
<tr>
<td valign="top" align="left">&#x003B7;</td>
<td valign="top" align="center">Learning rate</td>
</tr>
<tr>
<td valign="top" align="left">&#x003BB;</td>
<td valign="top" align="center">Loss weighting coefficient</td>
</tr>
<tr>
<td valign="top" align="left">TP</td>
<td valign="top" align="center">True positives</td>
</tr>
<tr>
<td valign="top" align="left">FP</td>
<td valign="top" align="center">False positives</td>
</tr>
<tr>
<td valign="top" align="left">FN</td>
<td valign="top" align="center">False negatives</td>
</tr>
<tr>
<td valign="top" align="left">mAP</td>
<td valign="top" align="center">Mean average precision</td>
</tr>
<tr>
<td valign="top" align="left">FPS</td>
<td valign="top" align="center">Frames per second</td>
</tr>
<tr>
<td valign="top" align="left">IoU</td>
<td valign="top" align="center">Intersection over Union</td>
</tr>
<tr>
<td valign="top" align="left">SIoU</td>
<td valign="top" align="center">SCYLLA Intersection over Union</td>
</tr>
<tr>
<td valign="top" align="left">GConv</td>
<td valign="top" align="center">Global Convolution</td>
</tr>
<tr>
<td valign="top" align="left">C3-GPF</td>
<td valign="top" align="center">C3-Global Pooling Fusion</td>
</tr>
<tr>
<td valign="top" align="left">MSIF</td>
<td valign="top" align="center">Multiscale Information Fusion</td>
</tr>
<tr>
<td valign="top" align="left">WFIF</td>
<td valign="top" align="center">Weighted Feature Information Fusion</td>
</tr>
<tr>
<td valign="top" align="left">UAV</td>
<td valign="top" align="center">Unmanned Aerial Vehicle</td>
</tr>
<tr>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">Convolutional Neural Network</td>
</tr>
<tr>
<td valign="top" align="left">YOLO</td>
<td valign="top" align="center">You Only Look Once</td>
</tr>
<tr>
<td valign="top" align="left">FPN</td>
<td valign="top" align="center">Feature Pyramid Network</td>
</tr>
<tr>
<td valign="top" align="left">PANet</td>
<td valign="top" align="center">Path Aggregation Network</td>
</tr>
<tr>
<td valign="top" align="left">BiFPN</td>
<td valign="top" align="center">Bidirectional Feature Pyramid Network</td>
</tr>
</tbody>
</table>
</table-wrap>
</app></app-group>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3068432/overview">Haichuan Yang</ext-link>, Tokushima University, Japan</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1679918/overview">Nadeem Akhtar</ext-link>, R. C. Patel Institute of Technology, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3285902/overview">S. Ida Evangeline</ext-link>, Government College of Engineering, Tirunelveli, India</p>
</fn>
</fn-group>
</back>
</article>