<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2026.1649168</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Robust federated learning for UAV object detection: a joint self-distillation and drift compensation approach</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Hangsun</surname> <given-names>Yu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3104521"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Jiang</surname> <given-names>Changnan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3411325"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhang</surname> <given-names>Ziyuan</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3414055"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ouyang</surname> <given-names>Heqing</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname> <given-names>Pengpeng</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Communication and Information Engineering, Nanjing University of Posts and Telecommunications</institution>, <city>Nanjing</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Beihang University</institution>, <city>Beijing</city>, <country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>University of Toronto</institution>, <city>Toronto</city>, <country country="ca">Canada</country></aff>
<aff id="aff4"><label>4</label><institution>The University of Hong Kong</institution>, <city>Hong Kong</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Changnan Jiang, <email xlink:href="mailto:jcnby@buaa.edu.cn">jcnby@buaa.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-04-01">
<day>01</day>
<month>04</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>20</volume>
<elocation-id>1649168</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>25</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Hangsun, Jiang, Zhang, Ouyang and Chen.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Hangsun, Jiang, Zhang, Ouyang and Chen</copyright-holder>
<license>
<ali:license_ref start_date="2026-04-01">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>The rapid advancement of unmanned aerial vehicles (UAVs) in disaster response and environmental monitoring has underscored the growing importance of real-time object detection within UAV swarm networks. However, the non-independent and identically distributed (non-IID) characteristics of data in UAV networks present significant challenges to model convergence and adaptability. To tackle these challenges, this study introduces a robust federated UAV object detection framework tailored for non-IID data distributions. The framework aims to enhance adaptability across clients, thereby improving both detection performance and convergence speed. Our approach includes a self-distillation mechanism that leverages personalized knowledge from local model historical states to guide current local training, striking a balance between specialization and adaptability. Additionally, we propose a drift compensation mechanism to synchronize local and global model updates, mitigating model drift. We conducted extensive experiments on the VisDrone2019-DET dataset, comparing our method to baseline models. Results demonstrate that our approach accelerates convergence speed by approximately 2.2 times and enhances detection performance by around 3%, offering an efficient and robust solution for UAV-based object detection under non-IID conditions.</p></abstract>
<kwd-group>
<kwd>data heterogeneity</kwd>
<kwd>federated learning</kwd>
<kwd>model drift</kwd>
<kwd>self-distillation</kwd>
<kwd>UAV object detection</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was supported by the Aeronautical Science Foundation of China (2022Z071020002).</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="2"/>
<equation-count count="17"/>
<ref-count count="32"/>
<page-count count="16"/>
<word-count count="9782"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Unmanned aerial vehicles (UAVs) have experienced rapid expansion across various domains, including disaster response, environmental surveillance, and intelligent transportation systems. The efficacy and performance of these applications hinge significantly on real-time object detection. These applications demand real-time object detection that can adapt to diverse and dynamic environments. However, deploying object detection on UAVs faces distinct challenges at different stages: at the inference stage, limited computational resources necessitate lightweight models; at the training stage, when multiple UAVs collaborate, centralized data aggregation raises privacy concerns and imposes bandwidth limitations. Recent strides in deep learning have markedly enhanced object detection accuracy in UAV imagery, with the advent of lightweight convolutional approaches enabling the practical implementation of these models on resource-constrained UAV platforms (<xref ref-type="bibr" rid="B31">Zhu et al., 2021a</xref>; <xref ref-type="bibr" rid="B20">Wu et al., 2024</xref>). Nevertheless, these methodologies often necessitate centralized data aggregation, posing privacy vulnerabilities and proving unsuitable for decentralized frameworks like multi-UAV networks (<xref ref-type="bibr" rid="B6">Hafeez et al., 2024</xref>). Recent surveys and state-of-the-art detection models further illustrate both progress and challenges in UAV-based visual perception. A comprehensive survey of deep learning methods for UAV object detection and tracking highlights challenges in small object detection and complex backgrounds (<xref ref-type="bibr" rid="B21">Wu et al., 2021</xref>). An overview of emerging deep learning techniques for UAV imagery emphasizes dense small object scenarios and adaptation to onboard platforms (<xref ref-type="bibr" rid="B17">Tang et al., 2023</xref>).</p>
<p>Federated Learning (FL) has emerged as a prominent method for preserving privacy, attracting considerable attention in research by facilitating collaborative model training across decentralized devices without the need to share raw data (<xref ref-type="bibr" rid="B10">Liu et al., 2021</xref>). This approach holds significant promise for applications involving cooperation among multiple Unmanned Aerial Vehicles. However, UAVs are commonly deployed in diverse environments characterized by variations in object scales, lighting conditions, and scene complexity, factors that can impede model convergence and adaptability (<xref ref-type="bibr" rid="B12">Nguyen et al., 2021</xref>). The resulting data heterogeneity leads to inconsistent local updates and unreliable global aggregation, presenting substantial challenges for the practical implementation of FL in UAV systems. Consequently, FL encounters significant hurdles in tasks such as object detection on UAV platforms. To address these deployment and heterogeneity issues, recent FL solutions such as FedYolo (<xref ref-type="bibr" rid="B27">Zhang et al., 2023</xref>) and ACSFed (<xref ref-type="bibr" rid="B3">Chen et al., 2022</xref>) demonstrate enhanced convergence and resilience in non-IID environments. FedYolo and ACSFed have been incorporated into our baseline comparison to provide a more comprehensive analysis. FedYolo modifies the YOLO detection head for federated scenarios using transformer-based attention to address client drift. On the other hand, ACSFed utilizes an adaptive client selection scheme that selects participants dynamically based on divergence and reliability.</p>
<p>Various strategies have been developed to address the challenges faced by Federated Learning in complex visual tasks, such as data heterogeneity and model performance issues (<xref ref-type="bibr" rid="B5">Fu et al., 2022</xref>; <xref ref-type="bibr" rid="B26">Zhagypar et al., 2022</xref>). One such strategy is semi-supervised optimization, which leverages unlabeled data more effectively in federated settings, thereby enhancing model performance in scenarios with limited data availability (<xref ref-type="bibr" rid="B28">Zhang et al., 2022</xref>). Another line of work, exemplified by AutoFed (<xref ref-type="bibr" rid="B29">Zheng et al., 2023</xref>), introduces heterogeneity-aware multimodal learning to improve robustness and adaptability in non-IID environments. Additionally, approaches like asynchronous updates and differential privacy mechanisms have been suggested to improve system robustness against client staleness and safeguard data privacy during transmission (<xref ref-type="bibr" rid="B13">Pan et al., 2023</xref>). Nevertheless, these methods do not fully tackle critical issues like local overfitting, personalized knowledge forgetting, and model drift resulting from data heterogeneity. These challenges underscore the necessity for more resilient solutions in federated UAV-based object detection within non-IID environments.</p>
<p>In this study, we introduce a framework termed FL-JSDDC (Federated Learning with Joint Self-Distillation and Drift Compensation) to tackle the challenges arising from non-IID data in federated UAV object detection. Our approach addresses two key aspects. Firstly, at the local model training level, we incorporate a self-distillation mechanism to improve the adaptability of local models and preserve their ability to adapt to client-specific data. Secondly, at the global model aggregation stage, we propose a drift compensation mechanism to alleviate model drift resulting from data heterogeneity. By integrating these two perspectives, we establish a more resilient and effective framework for federated UAV object detection. Our contributions can be succinctly outlined as follows:</p>
<list list-type="bullet">
<list-item><p>We present FL-JSDDC, a new federated UAV object detection framework that integrates self-distillation and drift compensation mechanisms to address the complexities arising from non-independent and identically distributed (non-IID) data in federated UAV object detection. Our method significantly improves the resilience and detection accuracy of federated UAV object detection models in diverse environments.</p></list-item>
<list-item><p>We introduce the self-distillation algorithm to enhance the adaptability of local UAV models by enabling the assimilation of personalized knowledge from past local models. Additionally, we present the drift compensation algorithm, which reduces model drift by dynamically aligning local model updates with the global model. This approach aims to improve model convergence and enhance detection performance in non-IID scenarios.</p></list-item>
<list-item><p>Extensive experiments were carried out on the VisDrone2019-DET dataset to compare FL-JSDDC with multiple baseline models under both IID and non-IID conditions. The findings indicate that FL-JSDDC exhibits notable superiority over current FL methods, manifesting enhanced convergence speed and detection performance, particularly in scenarios characterized by data heterogeneity.</p></list-item>
</list></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec>
<label>2.1</label>
<title>UAV-based object detection</title>
<p>The rapid progress in UAV technology has led to their growing utilization across diverse fields including environmental monitoring, disaster response, and intelligent transportation. Due to their adaptability and aerial vantage point, UAVs are well-suited for object detection tasks. Nevertheless, achieving precise and timely object detection in intricate settings poses a notable obstacle in the realm of computer vision.</p>
<p>Recent advancements in deep learning have greatly improved the ability of drones to detect objects. The TPH-YOLOv5 model integrates a Transformer Prediction Head into the YOLOv5 framework to enhance detection accuracy across various object scales. Additionally, it incorporates a Convolutional Block Attention Module (CBAM) to improve detection performance in crowded environments, as illustrated in the study by <xref ref-type="bibr" rid="B32">Zhu et al. (2021b)</xref>. To improve computational efficiency, a selective image patch processing approach based on convolutional neural networks was proposed by <xref ref-type="bibr" rid="B14">Plastiras et al. (2019)</xref>. By employing attention mechanisms to selectively target object-containing regions, this technique streamlines computation and enhances detection speed without compromising detection accuracy, rendering it particularly suitable for UAV applications with limited resources. <xref ref-type="bibr" rid="B22">Wu et al. (2017)</xref> introduced a vision-based system integrating object detection with Kalman filter-based tracking. This methodology facilitates real-time localization and tracking of aerial targets during UAV missions, demonstrating robustness and responsiveness across diverse environmental settings. A comprehensive survey was conducted by <xref ref-type="bibr" rid="B21">Wu et al. (2021)</xref> to provide an extensive overview of UAV-based object detection and tracking. The study explores the utilization of deep learning in UAV imagery and video analytics, identifies current challenges, and suggests potential research avenues for the future.</p>
<p>Regrettably, despite these advancements, current methods encounter notable challenges in the practical deployment of UAVs. Many existing approaches depend on centralized data collection, leading to privacy issues and impracticality in distributed UAV networks.</p>
</sec>
<sec>
<label>2.2</label>
<title>Federated learning-based object detection</title>
<p>The progress of Federated Learning (FL) has led to its increasing utilization in image recognition and object detection, especially in sensitive contexts like multi-UAV collaboration, traffic surveillance, and edge computing. FL facilitates collaborative model training among multiple clients without the need to exchange raw data, effectively addressing issues related to data silos and privacy breaches.</p>
<p>Recent studies have advanced the field by investigating privacy, heterogeneity, and drift resilience in federated object detection. For instance, <xref ref-type="bibr" rid="B18">Wang et al. (2024)</xref> proposed a federated object detection framework incorporating dynamic differential privacy. This approach adjusts noise levels based on feature sensitivity and training progress, achieving competitive detection accuracy on COCO and PASCAL VOC benchmarks while ensuring privacy. Similarly, <xref ref-type="bibr" rid="B16">Rashidi et al. (2024)</xref> introduced a self-configuring federated medical object detection framework that can adapt aggregation strategies to diverse and non-IID medical imaging data. Their work demonstrated feasibility on radiology datasets from multiple institutions. Furthermore, <xref ref-type="bibr" rid="B2">Awaysheh et al. (2024)</xref> conducted an empirical investigation on detecting drift in federated learning. Their study revealed that even minor concept or data drift at a single client can significantly impair the overall model performance, highlighting the critical need for robust drift-aware mechanisms.</p>
<p>The study by <xref ref-type="bibr" rid="B28">Zhang et al. (2022)</xref> introduces a robust semi-supervised federated learning (SSFL) framework tailored for UAV-based image recognition tasks. The framework proposes a FedMix parameter mixing strategy to facilitate model transfer between the server and clients. It also incorporates a FedFreq aggregation rule to dynamically adjust client weights, thereby enhancing model detection performance under non-IID data distributions. This approach effectively balances privacy preservation and model accuracy, showcasing the potential of SSFL in aerial image analysis. In a real-time scenario, <xref ref-type="bibr" rid="B24">Xie et al. (2023)</xref> present an asynchronous federated learning (AFL) framework for multi-license plate recognition. This method combines semantic communication with multi-task learning (MTL), formulating the problem as a multi-objective optimization task. The optimization is solved using the multi-gradient descent algorithm (MGDA). The asynchronous strategy proposed addresses client staleness during training, thereby improving system robustness and communication efficiency. Moreover, to enhance multi-task recognition while ensuring privacy, <xref ref-type="bibr" rid="B23">Xie et al. (2024)</xref> introduce a differentially private federated multi-task learning framework (DP-FL). This framework formulates object recognition as a multi-objective task and integrates local differential privacy by injecting calibrated noise into local gradients. Experimental results demonstrate that the method achieves strong detection performance and privacy protection across various benchmark datasets. <xref ref-type="bibr" rid="B8">Khazaei et al. (2025)</xref> introduced an optimal server representation (OSR) based Federated Learning (FL) framework to enhance training efficiency and minimize communication overhead. 
This approach involves sharing a limited set of privacy-preserving representative samples instead of transmitting model parameters to reduce bandwidth usage. Additionally, the framework incorporates knowledge distillation to assist local models in acquiring global representations, consequently enhancing recognition accuracy in visual tasks.</p>
<p>Despite recent advancements, current methods face significant challenges. While privacy and communication efficiency have been studied, ensuring consistent model accuracy across heterogeneous data distributions remains a critical hurdle due to the non-IID nature of data across clients. These challenges underscore the necessity for more robust and precise solutions in real-world federated UAV object detection scenarios.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>System model and problem description</title>
<p>In this section, we introduce the system model, problem description and design objective considered in this work.</p>
<sec>
<label>3.1</label>
<title>System model</title>
<sec>
<label>3.1.1</label>
<title>System architecture</title>
<p>We consider a federated UAV object detection framework consisting of a central server (base station) and multiple UAV clients. Each UAV is equipped with a certain amount of onboard image data and computational resources. The server coordinates the UAVs to collaboratively build the federated object detection model. Without sharing raw data, the system performs global aggregation to generate a high-quality global object detection model.</p>
<p>The system, depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>, comprises a central server (base station) and multiple clients (UAVs). In each communication round, individual UAV clients train a local object detection model using onboard visual data and transmit the updated model parameters to the central server. Upon receiving the local models from all clients, the server aggregates them to update a global model, which is then disseminated to all UAVs for subsequent training rounds. This decentralized learning approach eliminates the necessity to exchange raw data, thereby safeguarding privacy. Let <italic>S</italic> represent the central server, and let the <italic>N</italic> participating UAVs be denoted as distributed clients within the system. Each client <italic>i</italic> possesses a local dataset <italic>D</italic><sub><italic>i</italic></sub>, with data distributions among clients typically being non-IID. Local training is conducted exclusively based on the dataset of each client.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Scenario diagram of a federated UAV object detection framework.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-20-1649168-g0001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a federated learning system with unmanned aerial vehicles (UAVs) collecting local data and training object detection models on different objects, then sending local models to a base station for global model aggregation.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>3.1.2</label>
<title>Local training objective</title>
<p>The objective of each client <italic>i</italic> is to minimize the following local loss function during on-device training:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>Y</mml:mi><mml:mi>O</mml:mi><mml:mi>L</mml:mi><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:mi>R</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where <inline-formula><mml:math id="M2"><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> denotes the local model parameters of client <italic>i</italic>, and <italic>D</italic><sub><italic>i</italic></sub> represents the local dataset. Here, <italic>x</italic><sub><italic>j</italic></sub> is the input image and <italic>y</italic><sub><italic>j</italic></sub> is the corresponding annotated label including bounding boxes and class information. The term <italic>R</italic>(<italic>w</italic><sub><italic>i</italic></sub>) is a regularization function, such as L2 regularization, and &#x003B2; is a nonnegative scalar that controls the regularization strength. The function &#x003B6;<sub><italic>YOLO</italic></sub> denotes the YOLOv8 object detection loss, which typically consists of three components: classification loss, objectness confidence loss, and bounding box regression loss.</p>
<disp-formula id="EQ2"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>Y</mml:mi><mml:mi>O</mml:mi><mml:mi>L</mml:mi><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>Specifically, the classification loss, objectness confidence loss, and bounding box regression loss are defined as follows:</p>
<disp-formula id="EQ3"><mml:math id="M4"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:mrow><mml:msubsup><mml:mi>y</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:mstyle><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mi>y</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>j</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mo 
stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msubsup><mml:mi>y</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>j</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>u</mml:mi><mml:mi>n</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math><label>(3)</label></disp-formula>
<p>where <italic>C</italic> is the number of object categories, <inline-formula><mml:math id="M5"><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> is the ground truth label for class <italic>c</italic>, and <inline-formula><mml:math id="M6"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> is the predicted class probability. <inline-formula><mml:math id="M7"><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> indicates the presence of an object, and <inline-formula><mml:math id="M8"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the predicted objectness score. <italic>A</italic><sub><italic>int</italic></sub> denotes the intersection area between the predicted and ground truth bounding boxes, and <italic>A</italic><sub><italic>uni</italic></sub> denotes their union area.</p></sec>
<sec>
<label>3.1.3</label>
<title>Global aggregation objective</title>
<p>After each communication round, the central server collects updated local model parameters from all clients and aggregates them to update the global model using a weighted averaging strategy: <inline-formula><mml:math id="M9"><mml:mi>w</mml:mi><mml:mo>=</mml:mo><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, where <italic>w</italic> denotes the global model parameters, and the weight <italic>p</italic><sub><italic>i</italic></sub> is determined by the relative size of each client&#x00027;s local dataset:<inline-formula><mml:math id="M10"><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac></mml:math></inline-formula>, the overall optimization objective of the system is to minimize the global empirical risk:</p>
<disp-formula id="EQ4"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>where <italic>F</italic><sub><italic>i</italic></sub>(<italic>w</italic>) represents the local loss function of client <italic>i</italic> evaluated on the global model <italic>w</italic>. This objective ensures that the aggregated model is jointly optimized across all clients, with contributions weighted by data volume, thereby producing a globally shared model that strikes the best compromise in applicability across all clients.</p>
</sec>
</sec>
<sec>
<label>3.2</label>
<title>Problem description</title>
<p>Although federated UAV-based object detection systems have shown great potential in distributed collaborative training and privacy preserving applications, their practical deployment, especially for complex tasks such as object detection, still faces significant challenges. One of the most critical issues is the detection performance degradation and convergence difficulty caused by data heterogeneity, that is, non-IID data across clients.</p>
<p>In a conventional federated detection framework, it is typically assumed that the data distributions across clients are similar. In practical federated UAV object detection scenarios, this assumption often does not hold. The local data distribution <italic>P</italic><sub><italic>i</italic></sub>(<italic>x, y</italic>) held by client <italic>i</italic> often differs significantly from the global data distribution. Taking input samples <italic>x</italic> and predicted class labels <italic>y</italic> as an example, the local training process typically aims to minimize the cross entropy loss to maximize the conditional probability <italic>P</italic>(<italic>y</italic>|<italic>x</italic>), i.e.,</p>
<disp-formula id="EQ5"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0007E;</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mo class="qopname">log</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>According to Bayes&#x00027; theorem, the conditional probability <italic>P</italic>(<italic>y</italic>|<italic>x</italic>) can be rewritten as:</p>
<disp-formula id="EQ6"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo>|</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>|</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>where <italic>P</italic>(<italic>x</italic>|<italic>y</italic>) is the class conditional distribution, <italic>P</italic>(<italic>y</italic>) is the class prior distribution, and <italic>P</italic>(<italic>x</italic>) is the marginal distribution of input features. This decomposition shows that local model optimization is influenced not only by the conditional distribution <italic>P</italic>(<italic>x</italic>|<italic>y</italic>) but also by the class prior <italic>P</italic>(<italic>y</italic>).</p>
<p>However, in practical federated UAV object detection systems, heterogeneity in client data, caused by variations in sensing devices, environments, and application scenarios, leads to shifts in both <italic>P</italic>(<italic>x</italic>|<italic>y</italic>) and <italic>P</italic>(<italic>y</italic>). These shifts cause local models to overfit to client data distributions and converge toward different optima, which leads to inconsistent updates and degraded global detection performance during aggregation.</p>
</sec>
<sec>
<label>3.3</label>
<title>Design objectives</title>
<p>To address the detection performance degradation caused by data heterogeneity in federated UAV object detection frameworks, this paper proposes an improved training mechanism based on the idea of optimizing local adaptability and global model consistency. This mechanism incorporates self-distillation and drift compensation to mitigate the impact of model drift, enhance robustness and adaptability, and thereby accelerate convergence and improve detection performance.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Methodology</title>
<sec>
<label>4.1</label>
<title>Proposed method</title>
<p>To address the challenges of data heterogeneity and model drift in federated UAV object detection, we propose a novel method named <bold>FL-JSDDC</bold>. Built upon the YOLOv8n detection backbone, FL-JSDDC combines local knowledge transfer and global model alignment strategies to improve detection performance and convergence speed under non-IID distributions.</p>
<p>The overall framework is illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>, and consists of the following two core modules:</p>
<list list-type="simple">
<list-item><p><bold>(1) Self-distillation mechanism:</bold> This module leverages personalized knowledge from the historical local models to guide current training. It preserves client specific representations and mitigates local forgetting by aligning past knowledge with current updates.</p></list-item>
<list-item><p><bold>(2) Drift compensation mechanism:</bold> This module reduces global model drift caused by non-IID data by explicitly estimating and compensating for the deviation between local and global models. It helps align model updates across clients for more stable aggregation.</p></list-item>
</list>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Overview of the FL-JSDDC model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-20-1649168-g0002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a federated learning workflow for multiple UAVs with steps labeled: global model broadcast, local training using self-knowledge distillation and drift compensation, storage, sending back locally trained models, and model aggregation on an FL server.</alt-text>
</graphic>
</fig>
<p>The overall process of FL-JSDDC proceeds in the following five steps:</p>
<list list-type="simple">
<list-item><p><bold>(1) Local training:</bold> Each client initializes its local model with the global model and performs training on its local dataset.</p></list-item>
<list-item><p><bold>(2) Self-distillation mechanism:</bold> During training, each client employs a self-distillation mechanism that uses historical local models to generate soft targets, helping retain personalized knowledge and guide current training.</p></list-item>
<list-item><p><bold>(3) Drift compensation mechanism:</bold> Upon completing training, each client computes a drift vector between the updated local model and its historical local model to compensate for model drift caused by non-IID data.</p></list-item>
<list-item><p><bold>(4) Model aggregation:</bold> The compensated local models are uploaded to the server. The server aggregates them to produce a globally consistent model.</p></list-item>
<list-item><p><bold>(5) Model broadcast:</bold> The updated global model is then sent back to all clients, initiating the next round of federated training.</p></list-item>
</list>
<p>In each communication round, the self-distillation mechanism first enhances local adaptability by leveraging historical knowledge, followed by the drift compensation mechanism that aligns local updates with the global objective. In this way, improvements are made from both the local training and global aggregation perspectives to ensure robustness across non-IID data environments.</p>
<sec>
<label>4.1.1</label>
<title>Self-distillation mechanism</title>
<p>We introduce a supervised self-distillation framework for individual client detectors, comprising five consecutive stages: an input block, a bounding box alignment block, a global class probability alignment block, a channel semantic alignment block, and an output block. Raw images remain within the UAV client and knowledge transfer occurs from both the historical personalized local model and the global model (teacher) to the local model (student) through the three alignment blocks delineated below.</p>
<p><bold>(1) Input block</bold> After standard preprocessing, each minibatch is forwarded through both the historical local model (teacher) and the current local model (student) via parallel forward propagation. The historical local model is retained from previous communication rounds and serves as a personalized knowledge reservoir. This process yields corresponding classification logits <inline-formula><mml:math id="M14"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> for each detection token <italic>j</italic> &#x0003D; 1, &#x02026;, <italic>M</italic>, which are used in the global class probability alignment loss &#x003B6;<sub><italic>G</italic></sub> in <xref ref-type="disp-formula" rid="EQ8">Equation 8</xref>. 
Simultaneously, it extracts intermediate feature maps <inline-formula><mml:math id="M15"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">F</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">F</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> from head layer &#x02113;, where <inline-formula><mml:math id="M16"><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> denotes the activation tensor at head layer &#x02113;; Together, the generated outputs including classification logits, objectness scores, and spatial features form the basis for subsequent knowledge alignment.</p>
<p><bold>(2) Bounding Box Alignment block</bold> To distill localization knowledge, we align the regression predictions between the teacher and student models. Specifically, let the predicted normalized bounding box coordinates for each detection token <italic>j</italic> be denoted as <inline-formula><mml:math id="M17"><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M18"><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></inline-formula> for the teacher and student models, respectively. The bounding box alignment loss is computed as:</p>
<disp-formula id="EQ7"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>where ||&#x000B7;||<sub>2</sub> denotes the &#x02113;<sub>2</sub> norm. The vector <italic>b</italic><sup>(<italic>j</italic>)</sup>&#x02208;&#x0211D;<sup>4</sup> contains the center coordinates (<italic>x</italic>, <italic>y</italic>), width <italic>w</italic>, and height <italic>h</italic> of the bounding box for token <italic>j</italic>.</p>
<p>This loss encourages the student model to produce bounding boxes that are spatially consistent with the teacher&#x00027;s regression outputs, thereby improving localization accuracy under non-IID data distributions.</p>
<p><bold>(3) Global class-probability alignment block</bold> Let <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> denote the teacher and student classification logits of the <italic>j</italic>-th detection token. Following temperature scaling, the alignment loss is as shown in <xref ref-type="disp-formula" rid="EQ8">Equation 8</xref>.</p>
<disp-formula id="EQ8"><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">KL</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>/</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02225;</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>/</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where &#x003C3;(&#x000B7;) is the sigmoid (multi-label) or softmax (single-label) function, <italic>C</italic> is the number of classes, &#x003C4;<sub><italic>G</italic></sub>&#x0003E;0 is the temperature, and KL(&#x000B7;&#x02225;&#x000B7;) denotes the Kullback-Leibler divergence. Minimizing <xref ref-type="disp-formula" rid="EQ8">Equation 8</xref> transfers high-level semantic priors from teacher to student.</p>
<p><bold>(4) Channel-semantic alignment block</bold> To further improve spatial awareness, we align the relative channel importances between teacher and student over head layer &#x02113; (including three feature maps P3, P4, and P5). This process consists of two steps: extracting spatial channel means and then computing a KL divergence over their softmax scaled distributions.</p>
<p>To align the channel semantics, we apply temperature scaled softmax followed by KL divergence, as shown in <xref ref-type="disp-formula" rid="EQ9">Equation 9</xref>:</p>
<disp-formula id="EQ9"><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003B5;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">KL</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo class="qopname">softmax</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mrow><mml:mi mathvariant="script">F</mml:mi></mml:mrow></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02225;</mml:mo><mml:mo class="qopname">softmax</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mrow><mml:mi 
mathvariant="script">F</mml:mi></mml:mrow></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>where &#x003B5;<sub><italic>a</italic></sub>&#x02208;&#x0211D; is a layer specific importance coefficient used to emphasize different feature map layers (<italic>P</italic>3, <italic>P</italic>4, and <italic>P</italic>5) during the channel semantic alignment process. Here, <italic>a</italic> &#x0003D; 1, 2, 3 correspond to the three feature maps at different spatial resolutions, and &#x02113;<sub><italic>a</italic></sub> denotes the associated head layer in the YOLOv8 architecture. The feature tensor at layer &#x02113;<sub><italic>a</italic></sub> is denoted as <inline-formula><mml:math id="M23"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">F</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>. To improve alignment effectiveness, we assign larger weights to lower level features by setting &#x003B5;<sub>1</sub>&#x0003E;&#x003B5;<sub>2</sub>&#x0003E;&#x003B5;<sub>3</sub>, which strengthens the alignment focus on <italic>P</italic>3 and enhances the detection of small objects commonly present in UAV imagery. 
The parameter &#x003C4;<sub><italic>C</italic></sub>&#x0003E;0 represents the temperature scaling factor used in the channel semantic alignment loss.</p>
<p><bold>(5) Output block and unified objective</bold> Based on the above calculations, the student is optimized with a composite loss, as shown in <xref ref-type="disp-formula" rid="EQ10">Equation 10</xref>:</p>
<disp-formula id="EQ10"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>where &#x003B6;<sub><italic>CE</italic></sub> is the standard classification loss, &#x003B6;<sub><italic>box</italic></sub> is the bounding box regression term, and &#x003BB;<sub>1</sub>, &#x003BB;<sub>2</sub>, &#x003BB;<sub>3</sub>&#x02265;0 weight the alignment losses. This joint objective enforces global consistency while retaining local personalization, thus alleviating overfitting and drift in heterogeneous federated environments.</p></sec>
<sec>
<label>4.1.2</label>
<title>Drift compensation mechanism</title>
<p>In federated UAV object detection, the disparate distribution of images among devices results in incongruent local optimization goals, leading to divergent model update trajectories and subsequent model drift. To mitigate this issue, FL-JSDDC proposes the incorporation of a local drift variable for modeling and correction. Each client is tasked with managing a drift vector &#x003C6;<sub><italic>i</italic></sub> &#x0003D; <italic>w</italic>&#x02212;<italic>w</italic><sub><italic>i</italic></sub> to monitor the variance between its local model and the prevailing global model, subsequently fine tuning the model parameters prior to aggregation. To provide an approximate assessment of the local model drift, a drift regularization term is introduced as follows:</p>
<disp-formula id="EQ11"><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003C8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mi>w</mml:mi><mml:mo>|</mml:mo><mml:msup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula> 
<p>Each client uses this regularization term and the empirical loss term on its corresponding dataset to train both the model parameters and the local drift variable, thus converting the constrained optimization problem into an unconstrained one.</p>
<p>In FL-JSDDC, the drift compensation loss for each client consists of two components: the drift regularization term and the gradient correction term. The drift regularization term is used to penalize the deviation of the local model from the global model, while the gradient correction term adjusts the local model toward the global model by correcting the local gradient during the training process, as shown in <xref ref-type="disp-formula" rid="EQ12">Equation 12</xref>:</p>
<disp-formula id="EQ12"><mml:math id="M37"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>D</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:msup><mml:mrow><mml:mi>&#x003C8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BE;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>where &#x003B1; is the weight coefficient for the drift regularization term, and &#x003BE;<sub><italic>i</italic></sub> controls the gradient correction term for stochastic gradient optimization. To smooth out gradient heterogeneity, we set the gradient correction term as <inline-formula><mml:math id="M38"><mml:msub><mml:mrow><mml:mi>&#x003BE;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x003B7;</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:mo>&#x02329;</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mo>&#x0232A;</mml:mo></mml:mrow></mml:math></inline-formula>, where &#x003B7; is the learning rate, and <italic>E</italic> is the number of training iterations in one round. &#x003BC;<sub><italic>i</italic></sub> is the local update of the <italic>i</italic>-th client&#x00027;s local parameters in the previous round, and &#x003BC; is the average update of all clients&#x00027; local parameters in the previous round. 
In the <italic>t</italic>-th round, we have: <inline-formula><mml:math id="M39"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M40"><mml:msup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, where <inline-formula><mml:math id="M41"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M42"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> are the local model parameters of client <italic>i</italic> in the <italic>t</italic>-th and (<italic>t</italic>&#x02212;1)-th 
rounds, respectively. The purpose of this term is to reduce the variance of the local gradients.</p>
<p>Next, we introduce the update rule for the local drift variable &#x003C6;<sub><italic>i</italic></sub>. Taking the <italic>t</italic>&#x0002B;1-th iteration as an example, in FL-JSDDC, the local drift variable tracks the gap between the local model and the global model. In the <italic>t</italic>&#x0002B;1-th round of training, we assume that the global model parameters are updated to <italic>w</italic><sup><italic>t</italic>&#x0002B;1</sup>, while the local model parameters remain fixed. Then we can update the local drift variable using <inline-formula><mml:math id="M43"><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, but due to the unavailability of global data, it is impossible to directly update the global model.</p>
<p>Assume we first update the local model parameters from <inline-formula><mml:math id="M44"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> to <inline-formula><mml:math id="M45"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>. Then we consider the following two points: 1) At the beginning of each round, the local model parameters are assigned to the global model parameters: <inline-formula><mml:math id="M46"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>. 2) For client <italic>i</italic>, the local model parameters <inline-formula><mml:math id="M47"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> are an estimate of the updated global model parameters <italic>w</italic><sup><italic>t</italic>&#x0002B;1</sup>. Thus, we can approximate the update of the local drift variable as:</p>
<disp-formula id="EQ13"><mml:math id="M48"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02248;</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p>In this way, we use the update of the local model parameters to adjust the local drift variable (<xref ref-type="statement" rid="algo1">Algorithm 1</xref> and <xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<statement content-type="algorithm" id="algo1">
<label>Algorithm 1</label>
<title>FL-JSDDC: federated learning with joint self-distillation and drift compensation.</title>
<p>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-20-1649168-i0001.tif"/>
</p>
</statement>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Simulated data distribution across UAV clients under three partitioning schemes. From left to right: <bold>(a)</bold> IID setting where class distributions are uniform across clients; <bold>(b)</bold> ND(1) setting with mild label imbalance; and <bold>(c)</bold> ND(2) setting exhibiting severe non-IID distributions generated via a Dirichlet process. C1&#x02013;C10 denote the ten object categories in the VisDrone2019-DET dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-20-1649168-g0003.tif">
<alt-text content-type="machine-generated">Three colored stacked bar charts compare the distribution of ten sample categories, labeled C1 through C10, across five UAV IDs for three settings: IID, NonIID1, and NonIID2. In IID, bars are evenly distributed; NonIID1 and NonIID2 settings display increasing unevenness among UAVs and sample categories, with NonIID2 showing the most skewed distribution. Legends are identical for all charts.</alt-text>
</graphic>
</fig>
<p>To update the global model parameters, each client uses its local drift variable <inline-formula><mml:math id="M49"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> to correct the local model parameters before model aggregation. Then, each client uploads the corrected local parameters to the server. Similar to FedAvg, the server performs a weighted average of the corrected local parameters to obtain the global model parameters:</p>
<disp-formula id="EQ14"><mml:math id="M50"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mi>D</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<p>where <italic>D</italic> is the global dataset. The final joint local objective function in FL-JSDDC is formulated as:</p>
<disp-formula id="EQ15"><mml:math id="M51"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B6;</mml:mi></mml:mrow><mml:mrow><mml:mi>D</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<p>By jointly integrating self-distillation and drift compensation, FL-JSDDC enhances both knowledge transfer and parameter alignment during local training. This leads to faster convergence and improved detection performance under non-IID data distributions.</p>
</sec>
</sec>
<sec>
<label>4.2</label>
<title>Procedure of the proposed algorithm</title>
<p>In this section, we outline the key steps of the proposed FL-JSDDC algorithm, which integrates both self-distillation and drift compensation mechanisms to tackle data heterogeneity in federated object detection.</p>
<p>The algorithm proceeds through the following steps:</p>
<list list-type="simple">
<list-item><p><bold>(1) Initialization:</bold> At the beginning of each communication round <italic>t</italic>, the server broadcasts the global model <italic>w</italic><sup><italic>t</italic></sup> to all clients. Each client <italic>i</italic> initializes its local model <inline-formula><mml:math id="M52"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x02190;</mml:mo><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and prepares its drift variable <inline-formula><mml:math id="M53"><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, which is either retained from the previous round or initialized to zero.</p></list-item>
<list-item><p><bold>(2) Local training:</bold> Each client performs local training on its private dataset <italic>D</italic><sub><italic>i</italic></sub> using an objective that combines supervised detection loss, self-distillation loss (guided by the historical model <inline-formula><mml:math id="M54"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula>), and a drift compensation loss designed to address model drift.</p></list-item>
<list-item><p><bold>(3) Parameter update:</bold> The client updates its local model parameters based on the total loss and adjusts the drift variable <inline-formula><mml:math id="M55"><mml:msubsup><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> to account for the parameter change since the previous round.</p></list-item>
<list-item><p><bold>(4) Transmission:</bold> Each client uploads a drift compensated version of its updated model, denoted as <inline-formula><mml:math id="M56"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula>, which combines the local model and the corresponding drift variable.</p></list-item>
<list-item><p><bold>(5) Global aggregation:</bold> The server aggregates the received drift compensated models from all clients using a weighted average to obtain the updated global model <italic>w</italic><sup><italic>t</italic>&#x0002B;1</sup> for the next round.</p></list-item>
</list>
<p>The FL-JSDDC algorithm integrates client-specific features with global model alignment to enhance convergence speed and detection performance in non-IID environments. Through the incorporation of self-distillation and drift compensation, this algorithm establishes a resilient and effective federated UAV object detection framework in scenarios with non-IID data distributions.</p>
<p>The FL-JSDDC framework, as proposed, entails a moderate increase in computational load. Self-distillation necessitates an additional forward pass and auxiliary loss computations, resulting in a per-batch cost approximately 1.4-1.6 &#x000D7; higher than that of conventional YOLOv8n training. Conversely, drift compensation entails lightweight vector operations with minimal overhead. Both components operate exclusively on the client side, without augmenting communication requirements. The accelerated convergence of FL-JSDDC diminishes the total number of rounds, thereby counterbalancing the augmented local computational burden.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experiments</title>
<p>In this section, we assess the efficacy of the FL-JSDDC method by contrasting it with various advanced FL approaches across diverse datasets and conditions. The assessment centers on two key metrics: (1) convergence rate and (2) model detection accuracy.</p>
<sec>
<label>5.1</label>
<title>Datasets</title>
<p>Experiments were conducted using the VisDrone2019-DET dataset (<xref ref-type="bibr" rid="B30">Zhu et al., 2019</xref>), a prominent object detection dataset extensively applied in smart surveillance and autonomous driving contexts. The dataset comprises 10,209 images in total (6,471 training, 548 validation, and 3,190 test images), with over 540,000 annotated bounding boxes across 10 object categories: pedestrian, person, car, van, bus, truck, motor, bicycle, awning-tricycle, and tricycle. The dataset poses notable challenges due to dense object presence, intricate backgrounds, and occlusions, rendering it ideal for assessing detection accuracy in real world federated UAV object detection scenarios.</p>
<p>To systematically evaluate the detection performance of various FL algorithms under different data heterogeneity scenarios, we construct three typical data partitioning schemes based on the VisDrone2019-DET dataset, as illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Detection Performance comparison of different FL algorithms over 50 communication rounds under <bold>(a)</bold> IID setting, <bold>(b)</bold> ND(1) setting, and <bold>(c)</bold> ND(2) setting. The proposed FL-JSDDC consistently outperforms baseline methods across all settings in both convergence speed and final detection performance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-20-1649168-g0004.tif">
<alt-text content-type="machine-generated">Three line charts labeled (a), (b), and (c) display mAP@50 values over 50 communication rounds for ten federated learning methods. Each chart has a legend listing FedAvg, FedNova, FedProx, FedPer, FedMIX, FedYolo, ACSFed, FL_SD, FL_DC, and FL_JSDDC, with FL_JSDDC consistently showing the highest mAP@50 performance across all charts.</alt-text>
</graphic>
</fig>
<p>In the <bold>IID setting</bold>, all UAV clients sample data uniformly from the entire dataset, ensuring similar class distributions across clients. This setting approximates an ideal environment with independent and identically distributed (IID) data. In the <bold>ND(1) setting</bold>, we introduce moderate label imbalance by slightly altering the class distributions among clients. This simulates practical scenarios where different UAVs observe different patterns due to varying perspectives or environments. In the <bold>ND(2) setting</bold>, we employ a Dirichlet distribution with concentration parameter &#x003B1; &#x0003D; 0.3 to generate severely skewed class distributions. The small &#x003B1; value produces highly imbalanced allocations where some clients may have minimal or no samples for certain classes. This significantly increases the heterogeneity across clients, where each client may contain only a limited subset of classes or even a single class. Such a setting realistically reflects challenges like data silos and class absence in practical federated UAV object detection deployments. These partitioning schemes provide a comprehensive foundation for evaluating model robustness in increasingly heterogeneous federated environments.</p>
</sec>
<sec>
<label>5.2</label>
<title>Baselines</title>
<p>We conducted a comparative analysis of FL-JSDDC with various FL methodologies, such as FedAvg (<xref ref-type="bibr" rid="B11">Mehta and Aneja, 2024</xref>), FedNova (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>), FedProx (<xref ref-type="bibr" rid="B9">Li et al., 2020</xref>), FedPer (<xref ref-type="bibr" rid="B1">Arivazhagan et al., 2019</xref>), FedYolo (<xref ref-type="bibr" rid="B27">Zhang et al., 2023</xref>), and ACSFed (<xref ref-type="bibr" rid="B3">Chen et al., 2022</xref>) in the context of federated learning. FedAvg employs weighted averaging of local updates to update the global model. FedNova introduces a normalization mechanism to mitigate local update bias in non-IID scenarios. FedProx incorporates a proximal regularization term to restrict local update deviation. FedPer enhances model robustness by refining the update process. In addition to classification tasks, we consider FedMIX (<xref ref-type="bibr" rid="B25">Yoon et al., 2021</xref>) as a benchmark for object detection in federated settings. FedYolo employs a modular architecture and pretrained transformer components to improve adaptation in federated settings, addressing inter-client variability through spatial-wise representation learning. ACSFed employs an Earth Mover&#x00027;s Distance-based method to evaluate client contributions and selectively engages clients to alleviate the effects of subpar or significantly divergent local updates. In contrast to FL-JSDDC, these strategies prioritize preserving model coherence between clients and the server, rather than directly addressing the challenge of local model deviation during parameter aggregation.</p>
<p>To further validate the robustness of our method, we conduct ablation studies by separately evaluating: FL-DC, which applies only drift compensation; FL-SD, which applies only self-distillation; and the full FL-JSDDC framework that combines both modules.</p>
</sec>
<sec>
<label>5.3</label>
<title>Implementation details</title>
<p>We adopt a standard FL architecture in our experiments. During each communication round, multiple clients perform local training on their private datasets and upload model updates to a central server for global aggregation. All experiments were conducted on a server with NVIDIA RTX 3090 GPU. Following the settings of the baseline methods (<xref ref-type="bibr" rid="B11">Mehta and Aneja, 2024</xref>; <xref ref-type="bibr" rid="B9">Li et al., 2020</xref>; <xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>; <xref ref-type="bibr" rid="B1">Arivazhagan et al., 2019</xref>; <xref ref-type="bibr" rid="B27">Zhang et al., 2023</xref>; <xref ref-type="bibr" rid="B3">Chen et al., 2022</xref>; <xref ref-type="bibr" rid="B25">Yoon et al., 2021</xref>), we use the Adam optimizer for all local training, along with cosine annealing learning rate scheduling. For consistency across experiments, the batch size for local training is set to 32, the number of local training epochs is 15, and the initial learning rate is 0.001. These settings align with best practices in federated object detection tasks.</p>
<p>We employ the YOLOv8n variant as the backbone detector. To demonstrate deployment feasibility on real UAV platforms, we evaluate inference performance on an NVIDIA Jetson Nano (128-core Maxwell GPU, Quad-core ARM Cortex-A57 CPU, and 4 GB LPDDR4 memory), operating within 5&#x02013;10 W and weighing about 300 g. This setup achieves over 20 FPS at 640 &#x000D7; 480 resolution, and fits UAVs with a payload capacity of 1&#x02013;2 kg, such as the DJI Matrice 200/300 series, ensuring real time applicability under resource constraints.</p>
<p>For the proposed FL-JSDDC algorithm, the drift compensation coefficient is set to &#x003B1; &#x0003D; 0.08. The weights for self-distillation alignment losses are set as &#x003BB;<sub>1</sub> &#x0003D; 0.3 for bounding box alignment, &#x003BB;<sub>2</sub> &#x0003D; 0.3 for global class probability alignment, and &#x003BB;<sub>3</sub> &#x0003D; 0.4 for channel semantic alignment. The L2 regularization weight is set to &#x003B2; &#x0003D; 0.01. The number of participating clients is set to 5 across all experiments. For FedProx, the proximal regularization parameter &#x003BC; is set to 0.1 for fair comparison. The hyperparameters of other baseline methods are kept consistent with those reported in their original publications.</p>
</sec>
<sec>
<label>5.4</label>
<title>Evaluation metrics</title>
<p>In this study, we adopt convergence speed and model detection performance under data heterogeneity as the primary evaluation metrics, which jointly reflect the robustness of the model. Specifically, convergence speed is measured by the number of communication rounds required to reach a target detection performance. To evaluate the overall detection performance of the model on object detection tasks, we adopt mean Average Precision at an IoU threshold of 0.5 (<bold>mAP&#x00040;50</bold>) as the primary detection performance metric, as it reflects both classification correctness and localization precision under practical conditions.</p>
<p>Specifically, <bold>mAP&#x00040;50</bold> is defined as the mean of Average Precision (AP) values computed across all object categories. For each category <italic>c</italic>&#x02208;{1, &#x02026;, <italic>C</italic>}, the AP is obtained by integrating the precision recall curve:</p>
<disp-formula id="EQ16"><mml:math id="M57"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">AP</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x0222B;</mml:mo></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext class="textrm" mathvariant="normal">d</mml:mtext><mml:mi>r</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<p>where P<sub><italic>c</italic></sub>(<italic>r</italic>) denotes the precision at recall level <italic>r</italic> for class <italic>c</italic>. A prediction is considered correct if the Intersection over Union (IoU) between the predicted bounding box and the ground truth box exceeds 0.5.</p>
<p>The final mAP&#x00040;50 is computed by averaging over all <italic>C</italic> categories:</p>
<disp-formula id="EQ17"><mml:math id="M58"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">mAP&#x00040;50</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">AP</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<p>This robustness metric has gained increasing attention in recent FL research (<xref ref-type="bibr" rid="B4">FedTest, 2025</xref>; <xref ref-type="bibr" rid="B7">Hu et al., 2024</xref>). By considering both aspects, our robustness metric provides a comprehensive perspective for evaluating the robustness of each method under various data distribution scenarios. It enables deeper insights into how different algorithms cope with data heterogeneity and local model drift in federated object detection environments.</p>
</sec>
<sec>
<label>5.5</label>
<title>Evaluation tasks</title>
<p>In this study, we design a series of evaluation tasks to comprehensively assess the detection performance of FL algorithms under different data heterogeneity and client scale conditions. All experiments are conducted based on the VisDrone2019-DET dataset.</p>
<p>To simulate varying degrees of data heterogeneity, we consider three distinct distribution settings. In the uniform distribution scenario, each client receives data uniformly and randomly sampled from the entire dataset, ensuring that all clients share a similar class distribution. In the mildly non-IID scenario, the data is still randomly assigned, but the class distribution varies slightly across clients, which reflects practical cases where different clients may observe different data patterns. In the highly non-IID setting, the client data is partitioned using a Dirichlet distribution (<xref ref-type="bibr" rid="B15">Qu et al., 2022</xref>) to induce significant disparity in class proportions among clients, thereby emulating extreme heterogeneity in practical federated UAV object detection scenarios.</p>
<p>Furthermore, to evaluate model detection performance and convergence speed under different system scales, we conduct experiments involving 5, 15, and 35 participating clients under each of the above distribution scenarios. These tasks enable us to analyze the robustness of each algorithm in response to increasing data heterogeneity and client population, particularly focusing on how detection performance evolves as distribution skewness and the number of participants grow.</p>
</sec>
<sec>
<label>5.6</label>
<title>Results and analysis</title>
<p>We conduct extensive experiments to validate the advantages of FL-JSDDC in terms of convergence speed and model detection performance. The proposed method is evaluated under varying participation levels, client scales, and degrees of data heterogeneity to assess its robustness. All reported results represent the mean detection performance of local models evaluated on each client&#x00027;s respective test set, aggregated over 10 repeated experiments to ensure statistical reliability.</p>
<p>Since both FL-JSDDC and the baseline algorithms consume the same computational resources per communication round, the detection performance objectives of FL-JSDDC are twofold: 1) to accelerate model convergence and reduce communication overhead, and 2) to improve detection performance across diverse training environments. The results demonstrate that the robustness of FL-JSDDC consistently outperforms that of existing federated object detection optimization methods, as shown in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>We denote the communication round of each method to achieve the target detection performance as (R&#x00023;) and the corresponding convergence speedup relative to FedAvg as (S&#x02191;).</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="left" colspan="6">5 Clients</th>
<th valign="top" align="left" colspan="6">15 Clients</th>
<th valign="top" align="left" colspan="6">35 Clients</th>
</tr>
<tr>
<th/>
<th valign="top" align="left" colspan="2">ND(1)</th>
<th valign="top" align="left" colspan="2">ND(2)</th>
<th valign="top" align="left" colspan="2">IID</th>
<th valign="top" align="left" colspan="2">ND(1)</th>
<th valign="top" align="left" colspan="2">ND(2)</th>
<th valign="top" align="left" colspan="2">IID</th>
<th valign="top" align="left" colspan="2">ND(1)</th>
<th valign="top" align="left" colspan="2">ND(2)</th>
<th valign="top" align="left" colspan="2">IID</th>
</tr>
 <tr>
<th/>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
<th valign="top" align="center">R&#x00023;</th>
<th valign="top" align="center">S&#x02191;</th>
</tr>
<tr>
<th valign="top" align="center" colspan="19">Target: 21%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FedAvg</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">19</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">16</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">23</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">19</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">26</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">17</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">FedNova</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">19</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">23</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">19</td>
<td valign="top" align="center">0.89</td>
</tr>
<tr>
<td valign="top" align="left">FedProx</td>
<td valign="top" align="center">9</td>
<td valign="top" align="center">1.44</td>
<td valign="top" align="center">31</td>
<td valign="top" align="center">0.61</td>
<td valign="top" align="center">8</td>
<td valign="top" align="center">1.25</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">1.45</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">1.30</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">1.27</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">1.31</td>
</tr>
<tr>
<td valign="top" align="left">FedPer</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">25</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">14</td>
<td valign="top" align="center">1.14</td>
<td valign="top" align="center">30</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">23</td>
<td valign="top" align="center">0.74</td>
<td valign="top" align="center">34</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">23</td>
<td valign="top" align="center">0.74</td>
</tr>
<tr>
<td valign="top" align="left">FedMIX</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">1.18</td>
<td valign="top" align="center">28</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">1.23</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">1.18</td>
<td valign="top" align="center">16</td>
<td valign="top" align="center">1.19</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">22</td>
<td valign="top" align="center">0.77</td>
</tr>
<tr>
<td valign="top" align="left">FedYolo</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">1.18</td>
<td valign="top" align="center">21</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">1.07</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.08</td>
<td valign="top" align="center">19</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">28</td>
<td valign="top" align="center">0.93</td>
<td valign="top" align="center">16</td>
<td valign="top" align="center">1.06</td>
</tr>
<tr>
<td valign="top" align="left">ACSFed</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">1.30</td>
<td valign="top" align="center">23</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">1.23</td>
<td valign="top" align="center">26</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">1.18</td>
<td valign="top" align="center">17</td>
<td valign="top" align="center">1.12</td>
<td valign="top" align="center">31</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">14</td>
<td valign="top" align="center">1.21</td>
</tr>
<tr>
<td valign="top" align="left">FL-SD</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">1.30</td>
<td valign="top" align="center">22</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">9</td>
<td valign="top" align="center">1.11</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.33</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">1.30</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">1.27</td>
<td valign="top" align="center">27</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">14</td>
<td valign="top" align="center">1.21</td>
</tr>
<tr>
<td valign="top" align="left">FL-DC</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.08</td>
<td valign="top" align="center">27</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">14</td>
<td valign="top" align="center">1.14</td>
<td valign="top" align="center">29</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.08</td>
<td valign="top" align="center">18</td>
<td valign="top" align="center">1.06</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">17</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">FL-JSDDC</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">2.6</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.58</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">2.5</td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">2.66</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">1.53</td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">2.17</td>
<td valign="top" align="center">9</td>
<td valign="top" align="center">2.11</td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">1.73</td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">2.83</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Methods that fail to reach the target are marked with &#x0201C;-&#x0201D;.</p>
</table-wrap-foot>
</table-wrap>
<sec>
<label>5.6.1</label>
<title>Evaluation of FL-JSDDC on convergence speed</title>
<p><xref ref-type="table" rid="T1">Table 1</xref> compares the number of communication rounds (R&#x00023;) required for FL-JSDDC and seven baseline methods (FedAvg, FedNova, FedProx, FedPer, FedMIX, FedYolo and ACSFed), along with two ablation variants, FL-SD and FL-DC to reach the target detection performance of 21%. All experiments are conducted on the VisDrone2019-DET dataset under three data distribution settings: <bold>IID</bold>, <bold>ND(1)</bold>(mild non-IID), and <bold>ND(2)</bold>(severe non-IID).</p>
<p>In the IID setting, data is uniformly distributed, and local datasets are similar across clients. As a result, the divergence between local and global models is minimal, yielding robust training. For instance, with 15 clients, FedAvg requires 13 rounds to reach the target detection performance due to its simple weighted averaging strategy, which fails to correct client model deviations. FedNova slightly improves convergence by normalizing updates, but still needs 15 rounds. FedProx introduces a proximal term to restrict update deviation, reducing the round count to 10, but inconsistencies in update directions remain. FedPer adjusts global updates for better robustness, yet converges in 12 rounds, on par with FedAvg. FedMIX leverages a mixing strategy to fuse local and global models, achieving modest improvement with 11 rounds. FedYolo and ACSFed achieve convergence in 12 and 11 rounds respectively under IID settings, but their advantages in generalization and adaptive selection are less pronounced due to the statistical similarity among clients.</p>
<p>The ablation variant FL-SD focuses solely on self-distillation, demonstrating convergence within 10 rounds under IID settings, suggesting that incorporating soft guidance from the global model facilitates convergence. However, the absence of explicit constraints on model drift impedes FL-SD from achieving optimal speed. On the other hand, FL-DC, which exclusively implements drift compensation, achieves convergence in 12 rounds, effectively mitigating drift but lacking personalized feature learning. Integration of both mechanisms in FL-JSDDC yields superior detection performance, reaching the target in only 6 rounds. This outcome underscores the significance of combining self-distillation and drift compensation for rapid and robust convergence. A comparison between FL-SD and FL-JSDDC underscores the substantial enhancement in global alignment through drift correction, while the disparity between FL-DC and FL-JSDDC underscores the critical role of knowledge transfer in ensuring resilient representation learning.</p>
<p>To further validate the robustness of our method, we conduct ablation studies by separately evaluating: FL-DC, which applies only drift compensation; FL-SD, which applies only self-distillation; and the full FL-JSDDC framework that combines both modules.</p>
<p>In the context of ND(1) settings involving 15 participating clients exhibiting moderate heterogeneity, the convergence speed deteriorates across all baseline methods. Specifically, FedAvg, FedNova, and FedPer necessitate 16, 19, and 14 rounds, respectively, while FedProx and FedMIX demonstrate slightly improved performance, requiring 11 and 13 rounds, respectively. FedYolo and ACSFed converge in 15 and 13 rounds, respectively. The escalation in round requirements for convergence with even minor heterogeneity underscores the challenge. Notably, FL-SD and FL-DC exhibit comparable performance to the aforementioned baselines, suggesting limited efficacy of a singular approach. In contrast, the integration of FL-JSDDC, incorporating both modules, achieves notably enhanced convergence efficiency.</p>
<p>In the ND(2) scenario involving 15 client participants with diverse data distributions, convergence performance notably deteriorates across established methods. Specifically, both FedAvg and FedPer necessitate 23 and 30 communication rounds, respectively, to attain the desired detection performance level (21% mAP&#x00040;50). FedYolo achieves convergence in 24 rounds, while ACSFed does so in 26 rounds. Conversely, FedNova, FedProx, and FedMIX fall short of meeting the performance target within the 50 round threshold, underscoring their vulnerability to data distribution imbalances. These findings underscore the substantial impediment posed by significant data heterogeneity on model aggregation and training resilience. Notably, FL-JSDDC accomplishes the performance goal in a mere 15 rounds, showcasing exceptional adaptability and robustness in non-IID settings.</p>
<p>The detection performance is influenced by the scale of clients. A smaller number of clients, such as 5, leads to less fragmented data, lower heterogeneity, and quicker convergence across all methods. Notably, FL-JSDDC demonstrates the fastest convergence under these circumstances. Conversely, with 35 clients, data fragmentation increases, leading to higher heterogeneity and decreased convergence efficiency. Most baseline methods experience notable slowdowns, particularly FedAvg and FedNova. Even in this challenging scenario, FL-JSDDC maintains its superior speed and robustness. This resilience is attributed to its design, wherein self-distillation allows clients to extract globally aligned features from limited local data, and drift compensation regulates local deviations from the global model. These mechanisms enable FL-JSDDC to uphold synchronization and robust convergence, even in demanding conditions characterized by a large number of clients and significant data heterogeneity.</p></sec>
<sec>
<label>5.6.2</label>
<title>Evaluation of FL-JSDDC on detection performance</title>
<p><xref ref-type="table" rid="T2">Table 2</xref> reports the best detection performance achieved by FL-JSDDC under various settings and compares it against multiple baseline methods. Experiments are conducted on the VisDrone2019-DET dataset across different client scales (5, 15, 35) and data distribution settings (IID, ND(1), ND(2)).</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Final detection performance (mAP&#x00040;50) of different algorithms under various client scales and data heterogeneity settings.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="left" colspan="3">5 Clients</th>
<th valign="top" align="left" colspan="3">15 Clients</th>
<th valign="top" align="left" colspan="3">35 Clients</th>
</tr>
<tr>
<td/>
<th valign="top" align="center">ND(1)</th>
<th valign="top" align="center">ND(2)</th>
<th valign="top" align="center">IID</th>
<th valign="top" align="center">ND(1)</th>
<th valign="top" align="center">ND(2)</th>
<th valign="top" align="center">IID</th>
<th valign="top" align="center">ND(1)</th>
<th valign="top" align="center">ND(2)</th>
<th valign="top" align="center">IID</th>
</tr>
</thead>
<tbody>
 <tr>
<td valign="top" align="left">FedAvg</td>
<td valign="top" align="center">23.21%</td>
<td valign="top" align="center">21.17%</td>
<td valign="top" align="center">28.44%</td>
<td valign="top" align="center">22.18%</td>
<td valign="top" align="center">19.51%</td>
<td valign="top" align="center">26.71%</td>
<td valign="top" align="center">18.64%</td>
<td valign="top" align="center">16.23%</td>
<td valign="top" align="center">24.32%</td>
</tr>
<tr>
<td valign="top" align="left">FedNova</td>
<td valign="top" align="center">24.90%</td>
<td valign="top" align="center">21.51%</td>
<td valign="top" align="center">29.12%</td>
<td valign="top" align="center">24.43%</td>
<td valign="top" align="center">19.35%</td>
<td valign="top" align="center">27.24%</td>
<td valign="top" align="center">19.73%</td>
<td valign="top" align="center">15.02%</td>
<td valign="top" align="center">25.49%</td>
</tr>
<tr>
<td valign="top" align="left">FedProx</td>
<td valign="top" align="center">25.17%</td>
<td valign="top" align="center">22.32%</td>
<td valign="top" align="center">29.94%</td>
<td valign="top" align="center">23.97%</td>
<td valign="top" align="center">18.50%</td>
<td valign="top" align="center">27.79%</td>
<td valign="top" align="center">21.01%</td>
<td valign="top" align="center">14.94%</td>
<td valign="top" align="center">25.18%</td>
</tr>
<tr>
<td valign="top" align="left">FedPer</td>
<td valign="top" align="center">24.91%</td>
<td valign="top" align="center">21.35%</td>
<td valign="top" align="center">28.12%</td>
<td valign="top" align="center">23.56%</td>
<td valign="top" align="center">19.22%</td>
<td valign="top" align="center">27.68%</td>
<td valign="top" align="center">20.55%</td>
<td valign="top" align="center">15.76%</td>
<td valign="top" align="center">25.05%</td>
</tr>
<tr>
<td valign="top" align="left">FedMIX</td>
<td valign="top" align="center">25.84%</td>
<td valign="top" align="center">21.27%</td>
<td valign="top" align="center">29.32%</td>
<td valign="top" align="center">25.05%</td>
<td valign="top" align="center">19.43%</td>
<td valign="top" align="center">27.53%</td>
<td valign="top" align="center">20.72%</td>
<td valign="top" align="center">16.98%</td>
<td valign="top" align="center">25.28%</td>
</tr>
<tr>
<td valign="top" align="left">FedYolo</td>
<td valign="top" align="center">23.84%</td>
<td valign="top" align="center">20.83%</td>
<td valign="top" align="center">28.72%</td>
<td valign="top" align="center">22.12%</td>
<td valign="top" align="center">19.58%</td>
<td valign="top" align="center">26.83%</td>
<td valign="top" align="center">19.21%</td>
<td valign="top" align="center">16.44%</td>
<td valign="top" align="center">24.76%</td>
</tr>
<tr>
<td valign="top" align="left">ACSFed</td>
<td valign="top" align="center">23.37%</td>
<td valign="top" align="center">21.71%</td>
<td valign="top" align="center">29.34%</td>
<td valign="top" align="center">22.04%</td>
<td valign="top" align="center">19.96%</td>
<td valign="top" align="center">27.10%</td>
<td valign="top" align="center">19.92%</td>
<td valign="top" align="center">16.41%</td>
<td valign="top" align="center">25.01%</td>
</tr>
<tr>
<td valign="top" align="left">FL-SD</td>
<td valign="top" align="center">25.57%</td>
<td valign="top" align="center">22.16%</td>
<td valign="top" align="center">29.78%</td>
<td valign="top" align="center">25.56%</td>
<td valign="top" align="center">20.06%</td>
<td valign="top" align="center">28.21%</td>
<td valign="top" align="center">21.18%</td>
<td valign="top" align="center">17.75%</td>
<td valign="top" align="center">26.52%</td>
</tr>
<tr>
<td valign="top" align="left">FL-DC</td>
<td valign="top" align="center">23.47%</td>
<td valign="top" align="center">21.91%</td>
<td valign="top" align="center">28.94%</td>
<td valign="top" align="center">23.75%</td>
<td valign="top" align="center">19.18%</td>
<td valign="top" align="center">26.48%</td>
<td valign="top" align="center">18.33%</td>
<td valign="top" align="center">15.76%</td>
<td valign="top" align="center">25.03%</td>
</tr>
<tr>
<td valign="top" align="left">FL-JSDDC</td>
<td valign="top" align="center">29.15%</td>
<td valign="top" align="center">24.91%</td>
<td valign="top" align="center">32.61%</td>
<td valign="top" align="center">27.92%</td>
<td valign="top" align="center">21.29%</td>
<td valign="top" align="center">30.19%</td>
<td valign="top" align="center">25.69%</td>
<td valign="top" align="center">20.27%</td>
<td valign="top" align="center">28.49%</td>
</tr></tbody>
</table>
</table-wrap>
<p>In the IID scenario with uniformly distributed client data, model convergence exhibits greater resilience and leads to enhanced overall detection performance. For example, when considering 5 clients, FL-JSDDC demonstrates an mAP&#x00040;50 of 32.61%, showcasing a significant improvement over FedAvg (28.44%), and FedProx (29.94%). Additionally, FedYolo and ACSFed achieve 28.72% and 29.34%, respectively. As the client count increases, FL-JSDDC sustains strong detection performance, achieving 28.49% with 35 clients, surpassing FedAvg (24.32%) and FedNova (25.49%). These results underscore not only the algorithm&#x00027;s robustness under balanced data distributions but also its scalability.</p>
<p>In the ND(1) scenario involving 5 participating clients with moderate data heterogeneity, conventional methods exhibit significant performance deterioration. Specifically, FedAvg and FedNova experience a decrease in mAP&#x00040;50 to 23.21% and 24.90%, respectively, while FedYolo and ACSFed achieve 23.84% and 23.37%, respectively, showing a further decline as the number of clients increases. In contrast, FL-JSDDC demonstrates consistent detection performance, attaining 29.15% with 5 clients and 27.92% with 15 clients. These findings suggest that FL-JSDDC effectively mitigates the effects of mild data heterogeneity and sustains robust detection performance across varying numbers of clients.</p>
<p>In the ND(2) scenario characterized by heightened heterogeneity, conventional methods exhibit notable declines in detection accuracy. Specifically, when dealing with 35 clients, FedAvg and FedNova achieve only 16.23% and 15.02% accuracy, while FedProx and FedPer drop to 14.94% and 15.76%, respectively. In contrast, FedYolo and ACSFed attain 16.44% and 16.41% accuracy, respectively, in the same setting. Despite these formidable challenges, FL-JSDDC stands out by achieving 20.27% accuracy in this extreme scenario, surpassing all baseline methods and showcasing remarkable resilience to highly non-IID conditions.</p>
<p>From the perspective of client scale, increasing the number of clients from 5 to 35 leads to more fragmented data and greater training difficulty. While most baseline methods suffer significant detection performance degradation, FL-JSDDC maintains a high level of detection performance. For instance, FL-JSDDC achieves 24.91% mAP&#x00040;50 with 5 clients and still retains 20.27% with 35 clients, markedly outperforming all other methods. This result further validates its ability to cope with the increased heterogeneity introduced by larger client populations.</p>
<p>Overall, FL-JSDDC consistently delivers superior detection performance across all data partition settings and client scales. This improvement is primarily attributed to the synergistic effects of its self-distillation and drift compensation mechanisms. The self-distillation component guides local models to learn globally consistent features, improving their adaptability. Meanwhile, the drift compensation component effectively suppresses excessive local model drift, enhancing the robustness of global aggregation. Together, these mechanisms enable FL-JSDDC to maintain high detection performance and robust convergence even in challenging federated environments, making it a promising solution to the client drift problem in federated object detection.</p>
<p>To further demonstrate the convergence behavior and robustness of the proposed method, <xref ref-type="fig" rid="F4">Figure 4</xref> presents the detection performance curves of FL-JSDDC and baseline methods over 50 communication rounds under three data heterogeneity settings. All results are obtained under a federated configuration involving 5 participating clients. In the IID setting (<xref ref-type="fig" rid="F4">Figure 4a</xref>), where client data distributions are balanced, FL-JSDDC rapidly achieves high detection performance, outperforming other methods from the early stages of training.</p>
<p>Under the ND(1) setting (<xref ref-type="fig" rid="F4">Figure 4b</xref>), where client distributions slightly differ, FL-JSDDC still maintains a clear advantage in both convergence speed and robustness, while FedAvg and FedNova exhibit slower and more volatile trends.</p>
<p>In the most challenging ND(2) scenario (<xref ref-type="fig" rid="F4">Figure 4c</xref>), where class imbalance across clients is severe, baseline methods suffer significant detection performance degradation and stagnation. However, FL-JSDDC demonstrates strong resilience, achieving higher detection performance with faster convergence, validating the robustness of its joint self-distillation and drift compensation design in addressing model drift and local overfitting in non-IID environments.</p>
</sec>
</sec>
<sec>
<label>5.7</label>
<title>Discussion</title>
<p>In summarizing the simulation outcomes, FL-JSDDC exhibits distinct advantages in convergence speed and detection performance across varied data heterogeneity scenarios. In comparison to standard methods, FL-JSDDC not only attains quicker convergence (e.g., achieving the target detection performance in as few as 4&#x02013;6 rounds) but also sustains notably superior detection performance amidst escalating client numbers or data imbalance levels.</p>
<p>The enhancements primarily stem from the synergistic operation of two key modules: the self-distillation mechanism enhances the preservation of personalized local knowledge while conforming to global semantics, thereby enhancing adaptability. Simultaneously, the drift compensation mechanism adeptly mitigates local model drift, ensuring a more stable global aggregation. This dual framework demonstrates notable resilience in highly non-IID scenarios, where conventional federated object detection methods experience sluggish convergence and diminished detection performance.</p>
<p>Moreover, as the number of clients increases from 5 to 35, FL-JSDDC maintains consistently high detection performance, confirming its scalability. These results underscore the potential of the proposed framework for practical implementation in distributed UAV networks, where client diversity and limited resources are prevalent.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="s6">
<label>6</label>
<title>Conclusion</title>
<p>In this study, we introduce FL-JSDDC, a federated UAV object detection framework designed for non-IID data scenarios. Through the incorporation of self-distillation and drift compensation mechanisms, FL-JSDDC enhances model detection accuracy, convergence rate, and performance on decentralized client devices. Empirical findings demonstrate the superior performance of FL-JSDDC compared to current federated learning approaches, particularly in non-IID settings. Subsequent research endeavors will concentrate on customizing FL-JSDDC for dynamic client engagement and investigating model compression methods to facilitate its implementation in resource-limited environments.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>YH: Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. CJ: Writing &#x02013; original draft. ZZ: Writing &#x02013; original draft. HO: Writing &#x02013; original draft. PC: Writing &#x02013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Arivazhagan</surname> <given-names>M. G.</given-names></name> <name><surname>Aggarwal</surname> <given-names>V.</given-names></name> <name><surname>Singh</surname> <given-names>A. K.</given-names></name> <name><surname>Choudhary</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>Federated learning with personalization layers</article-title>. <source>arXiv</source> [preprint] arXiv:1912.00818. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1912.00818</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Awaysheh</surname> <given-names>F. M.</given-names></name> <name><surname>Rahimli</surname> <given-names>L.</given-names></name> <name><surname>Al Zubi</surname> <given-names>S.</given-names></name> <name><surname>Alawadi</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <source>Federated Learning Drift Detection: An Empirical Study on the Impact of Concept and Data Drift</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>IEEE</publisher-name>.</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>A.</given-names></name> <name><surname>Fu</surname> <given-names>Y.</given-names></name> <name><surname>Sha</surname> <given-names>Z.</given-names></name> <name><surname>Lu</surname> <given-names>G.</given-names></name></person-group> (<year>2022</year>). <article-title>An EMD-based adaptive client selection algorithm for federated learning in heterogeneous data scenarios</article-title>. <source>Front. Plant Sci</source>. <volume>13</volume>:<fpage>908814</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpls.2022.908814</pub-id><pub-id pub-id-type="pmid">35755701</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><collab>FedTest</collab> (<year>2025</year>). <article-title>A new scheme to enhance convergence rate and robustness in federated learning</article-title>. <source>arXiv</source> [preprint] arXiv:2501.11167. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2501.11167</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fu</surname> <given-names>M.</given-names></name> <name><surname>Shi</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Federated learning via unmanned aerial vehicle</article-title>. <source>arXiv</source> [preprint] arXiv:2210.10970. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2210.10970</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hafeez</surname> <given-names>S.</given-names></name> <name><surname>Mohjazi</surname> <given-names>L.</given-names></name> <name><surname>Imran</surname> <given-names>M. A.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Blockchain-enabled Clustered and Scalable Federated Learning (BCS-FL) framework in UAV networks</article-title>. <source>arXiv</source> [preprint] arXiv:2402.05973. doi: <pub-id pub-id-type="doi">10.1109/CAMAD59638.2023.10478423</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>K.</given-names></name> <name><surname>Xiang</surname> <given-names>L.</given-names></name> <name><surname>Tang</surname> <given-names>P.</given-names></name> <name><surname>Qiu</surname> <given-names>W.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Feature norm regularized federated learning: utilizing data disparities for model performance gains,&#x0201D;</article-title> in <source>Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24)</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>International Joint Conferences on Artificial Intelligence (IJCAI)</publisher-name>).</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Khazaei</surname> <given-names>E.</given-names></name> <name><surname>Taha</surname> <given-names>B.</given-names></name> <name><surname>Esmaeilzehi</surname> <given-names>A.</given-names></name> <name><surname>Hatzinakos</surname> <given-names>D.</given-names></name></person-group> (<year>2025</year>). <article-title>&#x0201C;OSR: toward developing efficient federated learning-based human activity recognition using optimal server representations,&#x0201D;</article-title> in <source>ICASSP 2025</source> - <italic>IEEE International Conference on Acoustics, Speech and Signal Processing</italic> (Hyderabad: IEEE).</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>T.</given-names></name> <name><surname>Sahu</surname> <given-names>A. K.</given-names></name> <name><surname>Talwalkar</surname> <given-names>A.</given-names></name> <name><surname>Smith</surname> <given-names>V.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Federated optimization in heterogeneous networks,&#x0201D;</article-title> in <source>Proceedings of Machine Learning and Systems (MLSys)</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Proceedings of Machine Learning and Systems (MLSys)</publisher-name>).</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Yuan</surname> <given-names>X.</given-names></name> <name><surname>Xiong</surname> <given-names>Z.</given-names></name> <name><surname>Kang</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Niyato</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>Federated learning for 6G communications: challenges, methods, and future directions</article-title>. <source>IEEE Netw</source>. <volume>35</volume>, <fpage>244</fpage>&#x02013;<lpage>251</lpage>. doi: <pub-id pub-id-type="doi">10.1109/MNET.011.2000842</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Mehta</surname> <given-names>S.</given-names></name> <name><surname>Aneja</surname> <given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Securing data privacy in machine learning: the FedAvg of federated learning approach,&#x0201D;</article-title> in <source>Proceedings of the 2024 4th Asian Conference on Innovation in Technology (ASIANCON)</source> (<publisher-loc>Pimpri</publisher-loc>: <publisher-name>IEEE</publisher-name>).</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nguyen</surname> <given-names>D. C.</given-names></name> <name><surname>Ding</surname> <given-names>M.</given-names></name> <name><surname>Pathirana</surname> <given-names>P. N.</given-names></name> <name><surname>Seneviratne</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>Federated learning for Internet of Things: a comprehensive survey</article-title>. <source>IEEE Commun. Surv. Tutorials</source> <volume>23</volume>, <fpage>1622</fpage>&#x02013;<lpage>1658</lpage>. doi: <pub-id pub-id-type="doi">10.1109/COMST.2021.3075439</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pan</surname> <given-names>D.</given-names></name> <name><surname>Khoshkholghi</surname> <given-names>M. A.</given-names></name> <name><surname>Mahmoodi</surname> <given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>Decentralized federated learning methods for reducing communication cost and energy consumption in UAV networks</article-title>. <source>arXiv</source> [preprint] arXiv:2304.06551. doi: <pub-id pub-id-type="doi">10.1007/978-3-031-31891-7_2</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Plastiras</surname> <given-names>G.</given-names></name> <name><surname>Kyrkou</surname> <given-names>C.</given-names></name> <name><surname>Theocharides</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>Efficient ConvNet-based object detection for unmanned aerial vehicles by selective tile processing</article-title>. <source>arXiv</source> [preprint] arXiv:1911.06073. doi: <pub-id pub-id-type="doi">10.1145/3243394.3243692</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Qu</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>C.</given-names></name> <name><surname>So</surname> <given-names>J.</given-names></name> <name><surname>Yu</surname> <given-names>F. R.</given-names></name> <name><surname>Song</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Federated learning on non-IID data silos: a benchmark study,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems (NeurIPS) 35</source> (<publisher-loc>La Jolla, CA</publisher-loc>: <publisher-name>Neural Information Processing Systems Foundation, Inc.</publisher-name>), <fpage>19872</fpage>&#x02013;<lpage>19887</lpage>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rashidi</surname> <given-names>G.</given-names></name> <name><surname>Bounias</surname> <given-names>D.</given-names></name> <name><surname>Bujotzek</surname> <given-names>M.</given-names></name> <name><surname>Mart&#x000ED;nez Mora</surname> <given-names>A.</given-names></name> <name><surname>Neher</surname> <given-names>P.</given-names></name> <name><surname>Maier-Hein</surname> <given-names>K. H.</given-names></name></person-group> (<year>2024</year>). <article-title>The potential of federated learning for self-configuring medical object detection in heterogeneous data distributions</article-title>. <source>Sci. Rep</source>. <volume>14</volume>:<fpage>23844</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-024-74577-0</pub-id><pub-id pub-id-type="pmid">39394440</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>G.</given-names></name> <name><surname>Ni</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Gu</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>W.</given-names></name></person-group> (<year>2023</year>). <article-title>A survey of object detection for UAVs based on deep learning</article-title>. <source>Remote Sens</source>. <volume>16</volume>:<fpage>149</fpage>. doi: <pub-id pub-id-type="doi">10.3390/rs16010149</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>B.</given-names></name> <name><surname>Feng</surname> <given-names>D.</given-names></name> <name><surname>Su</surname> <given-names>J.</given-names></name> <name><surname>Song</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>An effective federated object detection framework with dynamic differential privacy</article-title>. <source>Mathematics</source> <volume>12</volume>:<fpage>2150</fpage>. doi: <pub-id pub-id-type="doi">10.3390/math12142150</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Liang</surname> <given-names>H.</given-names></name> <name><surname>Joshi</surname> <given-names>G.</given-names></name> <name><surname>Poor</surname> <given-names>H. V.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Tackling the objective inconsistency problem in heterogeneous federated optimization,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> (<publisher-loc>La Jolla, CA</publisher-loc>: <publisher-name>Neural Information Processing Systems Foundation, Inc.</publisher-name>), <fpage>7611</fpage>&#x02013;<lpage>7623</lpage>.</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>A.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Mo</surname> <given-names>Y.</given-names></name> <name><surname>Xiang</surname> <given-names>S.</given-names></name> <name><surname>Duan</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>EUAVDet: an efficient and lightweight object detector for UAV aerial images with an edge-based computing platform</article-title>. <source>Drones</source> <volume>8</volume>:<fpage>261</fpage>. doi: <pub-id pub-id-type="doi">10.3390/drones8060261</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Hong</surname> <given-names>D.</given-names></name> <name><surname>Tao</surname> <given-names>R.</given-names></name> <name><surname>Du</surname> <given-names>Q.</given-names></name></person-group> (<year>2021</year>). <article-title>Deep learning for UAV based object detection and tracking: a survey</article-title>. <source>IEEE Geosci. Remote Sens. Magaz</source>. <volume>10</volume>, <fpage>91</fpage>&#x02013;<lpage>124</lpage>. doi: <pub-id pub-id-type="doi">10.1109/MGRS.2021.3115137</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Sui</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name></person-group> (<year>2017</year>). <article-title>Vision-based real-time aerial object localization and tracking for UAV sensing system</article-title>. <source>IEEE Access</source> <volume>5</volume>, <fpage>2281</fpage>&#x02013;<lpage>2291</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2017.2764419</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Zhou</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Dong</surname> <given-names>Z.</given-names></name></person-group> (<year>2024</year>). <article-title>Differentially private federated learning for multitask objective recognition</article-title>. <source>IEEE Trans. Indust. Informat</source>. <volume>20</volume>, <fpage>3767</fpage>&#x02013;<lpage>3777</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TII.2023.3342897</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Zhou</surname> <given-names>X.</given-names></name> <name><surname>Dong</surname> <given-names>Z.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Asynchronous federated learning for real-time multiple licence plate recognition through semantic communication,&#x0201D;</article-title> in <source>ICASSP 2023</source> - <italic>IEEE International Conference on Acoustics, Speech and Signal Processing</italic> (Rhodes Island: IEEE).</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yoon</surname> <given-names>T.</given-names></name> <name><surname>Shin</surname> <given-names>S.</given-names></name> <name><surname>Hwang</surname> <given-names>S. J.</given-names></name> <name><surname>Yang</surname> <given-names>E.</given-names></name></person-group> (<year>2021</year>). <article-title>FedMix: approximation of mixup under mean augmented federated learning</article-title>. <source>arXiv</source> [preprint] arXiv:2107.00233. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2107.00233</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhagypar</surname> <given-names>R.</given-names></name> <name><surname>Kouzayha</surname> <given-names>N.</given-names></name> <name><surname>ElSawy</surname> <given-names>H.</given-names></name> <name><surname>Dahrouj</surname> <given-names>H.</given-names></name> <name><surname>Al-Naffouri</surname> <given-names>T. Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Characterization of the global bias problem in aerial federated learning</article-title>. <source>arXiv</source> [preprint] arXiv:2212.14360. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2212.14360</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Chang</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Roy-Chowdhury</surname> <given-names>A. K.</given-names></name> <name><surname>Suresh</surname> <given-names>A. T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>FedYolo: augmenting federated learning with pretrained transformers</article-title>. <source>arXiv</source> [preprint] arXiv:2307.04905. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2307.04905</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Ma</surname> <given-names>S.</given-names></name> <name><surname>Yang</surname> <given-names>Z.</given-names></name> <name><surname>Xiong</surname> <given-names>Z.</given-names></name> <name><surname>Kang</surname> <given-names>J.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Robust semi-supervised federated learning for images automatic recognition in internet of drones</article-title>. <source>arXiv</source> [preprint] arXiv:2201.01230. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2201.01230</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>T.</given-names></name> <name><surname>Li</surname> <given-names>A.</given-names></name> <name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name> <name><surname>Luo</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>AutoFed: heterogeneity-aware federated multimodal learning for robust autonomous driving</article-title>. <source>arXiv</source> [preprint] arXiv:2302.08646. doi: <pub-id pub-id-type="doi">10.1145/3570361.3592517</pub-id></mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>P.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Dai</surname> <given-names>X.</given-names></name> <name><surname>Yuan</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;Vision meets drones: a challenge,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)</source> (<publisher-loc>Montreal, BC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2661</fpage>&#x02013;<lpage>2670</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>X.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>Q.</given-names></name></person-group> (<year>2021a</year>). <article-title>&#x0201C;TPH-YOLOv5: improved YOLOv5 based on transformer prediction head for object detection on drone-captured scenarios,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops</source> (<publisher-loc>Montreal, BC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2778</fpage>&#x02013;<lpage>2788</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>X.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>Q.</given-names></name></person-group> (<year>2021b</year>). <article-title>TPH-YOLOv5: improved YOLOv5 based on transformer prediction head for object detection on drone-captured scenarios</article-title>. <source>arXiv</source> [preprint] arXiv:2108.11539. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2108.11539</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/42313/overview">Alois C. Knoll</ext-link>, Technical University of Munich, Germany</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1687533/overview">Arsalan Muhammad Soomar</ext-link>, Gdansk University of Technology, Poland</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3125174/overview">Neethu Subash</ext-link>, National Institute of Technology, Tiruchirappalli, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3125319/overview">Vinoth K.</ext-link>, SRM University, India</p>
</fn>
</fn-group>
</back>
</article>