<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1521008</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>An improved ShuffleNetV2 method based on ensemble self-distillation for tomato leaf diseases recognition</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Ni</surname>
<given-names>Shuiping</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Jia</surname>
<given-names>Yue</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2866886"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhu</surname>
<given-names>Mingfu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Yizhe</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Wendi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Shangxin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Yawei</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Computer Science and Technology, Henan Polytechnic University</institution>, <addr-line>Jiaozuo</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Research and Development Department, Henan Chuitian Technology Corporation Limited</institution>, <addr-line>Hebi</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Tika Adhikari, North Carolina State University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Milind Ratnaparkhe, ICAR Indian Institute of Soybean Research, India</p>
<p>Hanmi Zhou, Henan University of Science and Technology, China</p>
<p>Jinrong He, Yan'an University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Yue Jia, <email xlink:href="mailto:212309010035@home.hpu.edu.cn">212309010035@home.hpu.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>21</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1521008</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>11</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Ni, Jia, Zhu, Zhang, Wang, Liu and Chen</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Ni, Jia, Zhu, Zhang, Wang, Liu and Chen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Timely and accurate recognition of tomato diseases is crucial for improving tomato yield. While large deep learning models can achieve high-precision disease recognition, these models often have a large number of parameters, making them difficult to deploy on edge devices. To address this issue, this study proposes an ensemble self-distillation method and applies it to the lightweight model ShuffleNetV2.</p>
</sec>
<sec>
<title>Methods</title>
<p>Specifically, based on the architecture of ShuffleNetV2, multiple shallow models at different depths are constructed to establish a distillation framework. Based on the fused feature map that integrates the intermediate feature maps of ShuffleNetV2 and shallow models, a depthwise separable convolution layer is introduced to further extract more effective feature information. This method ensures that the intermediate features from each model are fully preserved to the ensemble model, thereby improving the overall performance of the ensemble model. The ensemble model, acting as the teacher, dynamically transfers knowledge to ShuffleNetV2 and the shallow models during training, significantly enhancing the performance of ShuffleNetV2 without changing the original structure.</p>
</sec>
<sec>
<title>Results</title>
<p>Experimental results show that the optimized ShuffleNetV2 achieves an accuracy of 95.08%, precision of 94.58%, recall of 94.55%, and an F1 score of 94.54% on the test set, surpassing large models such as VGG16 and ResNet18. Among lightweight models, it has the smallest parameter count and the highest recognition accuracy.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The results demonstrate that the optimized ShuffleNetV2 is more suitable for deployment on edge devices for real-time tomato disease detection. Additionally, multiple shallow models achieve varying degrees of compression for ShuffleNetV2, providing flexibility for model deployment.</p>
</sec>
</abstract>
<kwd-group>
<kwd>tomato leaf diseases recognition</kwd>
<kwd>lightweight model</kwd>
<kwd>ShuffleNetV2</kwd>
<kwd>ensemble</kwd>
<kwd>self-distillation</kwd>
<kwd>model compression</kwd>
</kwd-group>
<counts>
<fig-count count="10"/>
<table-count count="8"/>
<equation-count count="9"/>
<ref-count count="31"/>
<page-count count="15"/>
<word-count count="6617"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>In 2022, the global tomato cultivation area was approximately 4.92 million hectares, yielding around 186.11 million tons of produce. However, tomato yields continue to be adversely affected by factors such as climate conditions and pest infestations (<xref ref-type="bibr" rid="B25">Wang et&#xa0;al., 2024</xref>). Various diseases can affect tomato plants at different growth stages, hindering their growth and ultimately leading to reduced yield and lower quality (<xref ref-type="bibr" rid="B12">Liang and Jiang, 2023</xref>). Traditional methods for identifying tomato diseases rely on manual inspection, a process that is both time-consuming and labor-intensive (<xref ref-type="bibr" rid="B17">Pandiyaraju et&#xa0;al., 2024</xref>). With the development of computer vision and deep learning technologies, these manual approaches have been increasingly replaced by automated solutions. However, high-performance models often come with a large number of parameters, which makes them difficult to deploy efficiently on edge devices. This poses a significant challenge for large-scale, real-time detection of tomato diseases (<xref ref-type="bibr" rid="B31">Zhou et&#xa0;al., 2024</xref>). Therefore, developing a more compact model that delivers performance comparable to larger models is essential for achieving both real-time and accurate disease detection in tomatoes.</p>
<p>Methods for identifying leaf diseases based on computer vision are divided into two categories: machine learning methods and deep learning methods. In terms of machine learning (<xref ref-type="bibr" rid="B19">Qin et&#xa0;al., 2016</xref>), applied a segmentation method combining the K-median clustering algorithm with linear discriminant analysis to extract 129 features from lesion images. They then compared the recognition accuracy of three machine learning algorithms: support vector machine (SVM), random forest (RF) and K-nearest neighbor methods. Among them, the optimal SVM model achieved a recognition accuracy of 94.74% on the test set (<xref ref-type="bibr" rid="B18">Patil et&#xa0;al., 2017</xref>). employed two methods to extract features. The first method involved calculating HSV color moments, including mean, variance, skewness, energy, and entropy for each color channel. The second method utilized 6th-order Exact Legendre Moments. The multi-class SVM they proposed achieved an accuracy of 99.1% on a three-class tomato dataset (<xref ref-type="bibr" rid="B15">Meenakshi et&#xa0;al., 2019</xref>). focused their research on potato leaf images to evaluate the effectiveness of various methods for recognizing potato diseases. They compared traditional machine learning techniques with neural networks and found that the artificial neural network achieved a 92% recognition accuracy, significantly outperforming traditional methods like SVM and RF. Despite the simplicity of these machine learning algorithms themselves, manually extracting features is a highly complex process that often requires domain expert knowledge and a significant investment of time. The scale of data that can be processed is very limited (<xref ref-type="bibr" rid="B13">Liu et&#xa0;al., 2024</xref>). 
In recent years, with the continuous development of deep learning technology, research on the application of deep learning in plant disease recognition has been increasing (<xref ref-type="bibr" rid="B20">Rangarajan et&#xa0;al., 2018</xref>). used deep learning models, AlexNet and VGG16, which were pre-trained on ImageNet, to classify seven types of tomato leaf disease images in the dataset, achieving accuracy rates of 97.29% and 97.49%, respectively (<xref ref-type="bibr" rid="B4">Edna Chebet et&#xa0;al., 2019</xref>). compared state-of-the-art deep learning models for plant disease detection, including VGG16, ResNet50, and DenseNet. They observed that models with greater depth achieved higher accuracy. Among these models, the DenseNet model with 121 layers performed the best, achieving an accuracy of 99.75%. <xref ref-type="bibr" rid="B29">Zhao et&#xa0;al. (2021)</xref> integrated the multi-scale feature extraction module and SE module into ResNet50, significantly enhancing its feature extraction capability and achieving a recognition accuracy of 96.81% on the tomato leaf dataset. <xref ref-type="bibr" rid="B30">Zhou et&#xa0;al. (2021)</xref> restructured the RDN model for the classification task and achieved a 95% recognition accuracy on the tomato dataset (<xref ref-type="bibr" rid="B12">Liang and Jiang, 2023</xref>). proposed the ResNet50-DPA model, where cascaded atrous convolution and a dual-path attention mechanism were introduced to obtain features with different scales and to capture key features, respectively. However, in these studies, the high recognition accuracy often depends on deeper network structures, which usually have a large number of parameters and significant memory consumption, making them unsuitable for deployment on resource-constrained small edge devices (<xref ref-type="bibr" rid="B2">Choudhary et&#xa0;al., 2020</xref>).</p>
<p>To address these challenges, we propose an ensemble self-distillation method and apply it to the lightweight model ShuffleNetV2, enabling its performance to reach the level of larger models. Unlike traditional knowledge distillation methods (<xref ref-type="bibr" rid="B8">Hinton et&#xa0;al., 2015</xref>), which rely on one-to-one knowledge transfer from a pre-trained teacher model to a student model, the approach proposed in this paper introduces a teacher model that is an ensemble of multiple student models. The knowledge of this teacher model is dynamically generated during the training process, avoiding the introduction of additional training costs. In contrast to traditional self-distillation methods, which typically use the original model as the teacher in the framework (as shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1A</bold>
</xref>), our method employs an ensemble of the original model and several shallow models as the teacher. This ensemble model is capable of fully integrating the information from each model, thereby providing the student models with richer and more comprehensive knowledge. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1B</bold>
</xref> illustrates a simple ensemble strategy where the logits from the original model and shallow models are averaged (<xref ref-type="bibr" rid="B27">Zhang et&#xa0;al., 2022</xref>). However, this approach neglects the intermediate feature maps from the individual models. Compared to deep feature maps, intermediate feature maps often contain more comprehensive information. Therefore, we further improve the ensemble strategy by integrating the intermediate feature maps from each model and performing deeper feature extraction (as shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1C</bold>
</xref>). In this way, the intermediate feature information from all branches is fully preserved for the ensemble model, allowing the ensemble model to utilize it more effectively to enhance its performance.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Schematic framework of distillation. <bold>(A)</bold> Self-distillation. <bold>(B)</bold> Averaged logits in self-distillation. <bold>(C)</bold> Ensemble self-distillation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g001.tif"/>
</fig>
<p>Specifically, we select ShuffleNetV2 as the student model. Based on its architecture, we build three shallow models at different depths to establish the distillation framework. Each shallow model is equipped with unique structures and parameter sizes, mitigating the branch homogeneity issue commonly observed in traditional online knowledge distillation (<xref ref-type="bibr" rid="B5">Gong et&#xa0;al., 2023</xref>). After constructing these shallow models, we fuse the intermediate feature maps from each model. Based on this fused feature map, depthwise separable convolution layers are incorporated to further extract features, improving the performance of the ensemble model without a significant increase in parameter count. Once the distillation framework is established, two regularization terms are introduced: First, the Kullback-Leibler (KL) divergence is applied to constrain the logits of the student models, aligning them more closely with the outputs of the teacher model. Second, the L2 norm (Euclidean distance) is used to regulate the intermediate features of the student models, ensuring greater consistency with those of the teacher model.</p>
<p>The main contributions of this paper are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>We propose an ensemble method that fuses intermediate feature maps of all models within the distillation framework and incorporates depthwise separable convolution layers on the fused feature maps to further extract features, thereby constructing a more effective ensemble model.</p>
</list-item>
<list-item>
<p>We utilize the more effective ensemble model as a teacher to dynamically transfer knowledge to each student model during training. As a student model, the optimized ShuffleNetV2, namely KD-ShuffleNetV2, achieves performance comparable to larger models such as VGG16 and ResNet18, without altering the original architecture, making it more suitable for real-time tomato disease recognition on edge devices.</p>
</list-item>
<list-item>
<p>The shallow models within the framework can be treated as compressed versions of ShuffleNetV2, achieving different levels of compression in terms of parameter count and floating-point operations, providing flexibility for model deployment.</p>
</list-item>
</list>
<p>The rest of this paper is organized as follows: Section 2 introduces the dataset used and the data processing procedure, and provides a detailed description of the proposed distillation method. Section 3 presents various experiments conducted on the proposed method, analyzes the experimental results, and visualizes the model. Section 4 provides a summary of the paper and a discussion on its limitations.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data processing</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Image datasets</title>
<p>The dataset used in this study is aggregated from four sources. The first source, Plant Village (<xref ref-type="bibr" rid="B24">SpMohanty, 2018</xref>), provides data samples on tomato leaf diseases from the following 10 categories: healthy(1591), bacterial spot(2127), early blight(1000), late blight(1909), leaf mold(952), septoria leaf spot(1771), yellow leaf curl virus(5357), mosaic virus(373), two-spotted spider mite(1676), and target spot(1404). The second source, Ai Challenger 2018 Crop Leaf Disease Challenge (<xref ref-type="bibr" rid="B3">Dataset AI Challenger, 2018</xref>), contains rich data samples on crop leaf diseases. However, we only use 1,469 samples of the powdery mildew category to fill the missing category in the Plant Village. The third source is PlantDoc (<xref ref-type="bibr" rid="B23">Singh et&#xa0;al., 2020</xref>), containing data samples of the following categories: healthy(62), bacterial spot(106), early blight(88), late blight(111), leaf mold(90), septoria leaf spot(155), yellow leaf curl virus(84), mosaic virus(54). The fourth source is Taiwan Tomato Disease (<xref ref-type="bibr" rid="B11">Huang and Chang, 2020</xref>), which contains data samples categorized as healthy(106), bacterial spot(110), late blight(98), leaf mold(67), powdery mildew(157), gray spot(84). The third and fourth sources contain abundant outdoor samples, which further enhance the diversity of our dataset. <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> shows examples of different tomato leaf diseases.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Examples of tomato diseases from the datasets.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g002.tif"/>
</fig>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Image preprocessing</title>
<p>The number of samples in the "gray spot" category within the Taiwan Tomato Disease is only 84, which is significantly lower than the number of samples in other categories in the aggregated dataset. Therefore, we excluded the "gray spot" category to achieve a more balanced distribution across categories. On one hand, there is a considerable imbalance in the number of samples across indoor categories. For instance, the mosaic virus, which has the fewest samples, consists of 373 instances, while the yellow leaf curl virus, the category with the most samples, contains 5,357 instances. On the other hand, the total number of indoor samples significantly exceeds that of outdoor samples. To mitigate this imbalance, random sampling was employed for the indoor categories with larger sample sizes, aiming to better balance the distribution both within the indoor categories and between indoor and outdoor data. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> summarizes the detailed information of the processed dataset used in this study. It should be noted that there are no outdoor samples available for two-spotted spider mite and target spot categories.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Summary of main datasets used in the study.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Dataset</th>
<th valign="middle" colspan="2" align="center">Plant Village</th>
<th valign="middle" colspan="2" align="center">AI Challenger 2018</th>
<th valign="middle" colspan="2" align="center">PlantDoc</th>
<th valign="middle" colspan="2" align="center">Taiwan</th>
<th valign="top" rowspan="2" align="center">Total</th>
</tr>
<tr>
<th valign="middle" align="center">Class</th>
<th valign="middle" align="center">Indoor</th>
<th valign="middle" align="center">Outdoor</th>
<th valign="middle" align="center">Indoor</th>
<th valign="middle" align="center">Outdoor</th>
<th valign="middle" align="center">Indoor</th>
<th valign="middle" align="center">Outdoor</th>
<th valign="middle" align="center">Indoor</th>
<th valign="middle" align="center">Outdoor</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">bacterial spot</td>
<td valign="middle" align="center">1400</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">14</td>
<td valign="middle" align="center">96</td>
<td valign="middle" align="center">94</td>
<td valign="middle" align="center">16</td>
<td valign="middle" align="center">1620</td>
</tr>
<tr>
<td valign="middle" align="center">early blight</td>
<td valign="middle" align="center">1000</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">82</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">1088</td>
</tr>
<tr>
<td valign="middle" align="center">healthy</td>
<td valign="middle" align="center">1100</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">24</td>
<td valign="middle" align="center">28</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">106</td>
<td valign="middle" align="center">1258</td>
</tr>
<tr>
<td valign="middle" align="center">late blight</td>
<td valign="middle" align="center">1200</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">20</td>
<td valign="middle" align="center">91</td>
<td valign="middle" align="center">40</td>
<td valign="middle" align="center">58</td>
<td valign="middle" align="center">1409</td>
</tr>
<tr>
<td valign="middle" align="center">leaf mold</td>
<td valign="middle" align="center">952</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">85</td>
<td valign="middle" align="center">63</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">1110</td>
</tr>
<tr>
<td valign="middle" align="center">powdery mildew</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">1200</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">18</td>
<td valign="middle" align="center">139</td>
<td valign="middle" align="center">1357</td>
</tr>
<tr>
<td valign="middle" align="center">septoria leaf spot</td>
<td valign="middle" align="center">1150</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">29</td>
<td valign="middle" align="center">128</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">1307</td>
</tr>
<tr>
<td valign="middle" align="center">two-spotted spider mite</td>
<td valign="middle" align="center">1676</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">1676</td>
</tr>
<tr>
<td valign="middle" align="center">target spot</td>
<td valign="middle" align="center">1404</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">1404</td>
</tr>
<tr>
<td valign="middle" align="center">mosaic virus</td>
<td valign="middle" align="center">373</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">48</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">427</td>
</tr>
<tr>
<td valign="middle" align="center">yellow leaf curl virus</td>
<td valign="middle" align="center">1900</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">231</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">2138</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">12155</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">1200</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">112</td>
<td valign="middle" align="center">789</td>
<td valign="middle" align="center">215</td>
<td valign="middle" align="center">323</td>
<td valign="middle" align="center">14794</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The data is resized to 64&#xd7;64, and the entire dataset is split into training and testing sets in a 7:3 ratio. To enhance the diversity of the dataset, thereby improving the model's generalization ability, data augmentation techniques, such as horizontal flipping, random rotation, brightness enhancement, contrast enhancement, etc., are applied exclusively to the training set during the model training phase. The effects of data augmentation are illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. The impact of data augmentation on the model's performance on the test set will be discussed in section 3.4.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Data augmentation for training set. <bold>(A)</bold> Original image. <bold>(B)</bold> Horizontal flip. <bold>(C)</bold> Random Rotate. <bold>(D)</bold> Brightness change. <bold>(E)</bold> Contrast change. <bold>(F)</bold> Add noise.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g003.tif"/>
</fig>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>The proposed method</title>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>ShuffleNetV2 model</title>
<p>
<xref ref-type="bibr" rid="B14">Ma et&#xa0;al. (2018)</xref> introduced a more lightweight ShuffleNetV2 unit, building on the ShuffleNetV1 architecture (<xref ref-type="bibr" rid="B28">Zhang et&#xa0;al., 2018</xref>), as depicted in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>. The ShuffleNetV2 unit encompasses two variants: the ShuffleNet Unit (SNU) and the Downsample ShuffleNet Unit (D-SNU). The SNU, illustrated in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>, divides the input feature map channels into two branches, where the left branch remains unaltered, and the right branch employs a 1&#xd7;1 standard convolution followed by a 3&#xd7;3 depthwise convolution, concluding with another 1&#xd7;1 standard convolution. In contrast, the D-SNU, shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4B</bold>
</xref>, directly partitions the input feature map channels into two branches, with the left branch incorporating a 3&#xd7;3 depthwise convolution with a stride of 2, succeeded by a 1&#xd7;1 standard convolution; the right branch mirrors this stride adjustment for its 3&#xd7;3 depthwise convolution. This architecture effectively leverages the device's parallel processing capabilities and greatly minimizes computational costs. By shuffling the feature channels, it enables interaction between different channels, thereby improving the model's feature representation and reducing its overall complexity.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Basic feature extraction module for the ShuffleNetV2 model. "Conv" denotes standard convolution; "BN" denotes batch normalization; "ReLU" denotes activation function. <bold>(A)</bold> ShuffleNet Unit (SNU). <bold>(B)</bold> Downsample ShuffleNet Unit (D-SNU).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g004.tif"/>
</fig>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Ensemble self-distillation framework</title>
<p>The framework of ensemble self-distillation based on ShuffleNetV2 1.0x is constructed as illustrated in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. Conv1, positioned at the start of the ShuffleNetV2 and serving to extract initial features from the input data, consists of 3&#xd7;3 standard convolution and BN, while Conv2, positioned at the end of the ShuffleNetV2, consists of 1&#xd7;1 standard convolution, BN, and ReLU. The ShuffleNetV2 is composed of Conv1, Stage 1, Stage 2, Stage 3, Conv2, and FC Layer 4. Each Stage is composed of one D-SNU and multiple SNUs.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Ensemble self-distillation framework based on ShuffleNetV2. "AvgPool" denotes average pooling layer; "FC Layer" denotes fully connected layer; "Softmax" denotes softmax activation function.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g005.tif"/>
</fig>
<p>Based on the structure and depth of the ShuffleNetV2, shallow models 1, 2, and 3 are constructed after Conv1, Stage 1, and Stage 2, respectively. The structure of shallow model 1 consists of Conv1, Attention Module 1, Shallow Module 1, and FC Layer 1. The structure of shallow model 2 consists of Conv1, Stage 1, Attention Module 2, Shallow Module 2, and FC Layer 2. The structure of shallow model 3 consists of Conv1, Stage 1, Stage 2, Attention Module 3, Shallow Module 3, and FC Layer 3. The deep feature maps from ShuffleNetV2 and the three shallow models are fused to construct the ensemble model. The ensemble model consists of an Ensemble Module and FC Layer 5. Before each FC Layer, an average pooling layer is placed to pool the input feature map to a size of 1&#xd7;1. Subsequently, the output of each FC Layer is processed by the softmax function.</p>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Lightweight convolution structure</title>
<p>The architecture of the shallow model impacts not only the number of parameters in the ensemble self-distillation framework, which subsequently influences the overall training time, but also the efficiency of knowledge transfer. By treating the Attention Module and Shallow Module collectively as a projector that aligns shallow features with deep features, the parameter count of this projector plays a crucial role in determining the effectiveness of knowledge distillation (<xref ref-type="bibr" rid="B1">Chen et&#xa0;al., 2022</xref>). For the sake of lightweight design, both the Attention Module and the Shallow Module within the shallow model are built based on a Lightweight Convolution Structure (LCS) (<xref ref-type="bibr" rid="B28">Zhang et al., 2022</xref>). In this structure, the input feature map has a channel count of <italic>in_c</italic>, the output feature map has a channel count of <italic>out_c</italic>, and the stride is denoted as <italic>s</italic>. This is represented as LCS(<italic>in_c, out_c, s</italic>).</p>
<p>As shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, the LCS(<italic>in_c, out_c, s</italic>) structure consists of two groups of depthwise separable convolutions. Each group of depthwise separable convolutions is composed of a 3&#xd7;3 depthwise convolution and a 1&#xd7;1 pointwise convolution. The depthwise convolution in the first group has a stride of <italic>s</italic>, while the stride for all other convolution operations is set to 1 by default.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Lightweight convolution structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g006.tif"/>
</fig>
</sec>
<sec id="s2_2_4">
<label>2.2.4</label>
<title>Attention module</title>
<p>To decide which shallow features are distilled, the Attention Module, as illustrated in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>, is introduced. The input feature map is first processed by an LCS with a stride of 2, reducing the spatial resolution through downsampling. Subsequently, an upsampling layer using bilinear interpolation with a scaling factor of 2 is applied to restore the original resolution. A sigmoid activation function is then used to generate the attention mask. Finally, the attention mask is element-wise multiplied with the input feature map to obtain the output feature map (<xref ref-type="bibr" rid="B28">Zhang et al., 2022</xref>). In detail, the LCS configurations corresponding to Attention Modules <italic>i</italic> (where <italic>i</italic>=1, 2, 3) are LCS(24,24,2), LCS(116,116,2), and LCS(232,232,2), respectively.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Attention module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g007.tif"/>
</fig>
</sec>
<sec id="s2_2_5">
<label>2.2.5</label>
<title>Shallow module</title>
<p>The architecture of the Shallow Module is constructed by stacking multiple LCS in sequence (<xref ref-type="bibr" rid="B28">Zhang et&#xa0;al., 2022</xref>). When designing the Shallow Module, the number of stacked LCSs and the size of (<italic>in_c, out_c</italic>) can be adjusted to ensure that, in terms of the number of parameters, the shallow model 1 is less than shallow model 2, and shallow model 2 is less than shallow model 3 (i.e., shallow model 1&lt; shallow model 2&lt; shallow model 3), so as to build a hierarchical structure and avoid the problem of homogenization of the models (<xref ref-type="bibr" rid="B5">Gong et&#xa0;al., 2023</xref>). Specifically, Shallow Module 1 consists of LCS(24, 116, 2), LCS(116, 464, 2), and LCS(464, 1024, 2); Shallow Module 2 is formed by stacking LCS(116, 464, 2) and LCS(464, 1024, 2); and Shallow Module 3 is composed of LCS(232, 464, 2) followed by LCS(464, 1024, 1).</p>
</sec>
<sec id="s2_2_6">
<label>2.2.6</label>
<title>Ensemble module</title>
<p>The shape of feature maps in the original model and shallow model are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. It can be observed that the output feature maps of Shallow Modules <italic>i</italic> (where <italic>i</italic>=1, 2, 3) and Conv2 have the same shape, all being 8&#xd7;8&#xd7;1024. Consequently, the four feature maps are fused into a single feature map through averaging, which serves as the input feature map for the ensemble module. The fused feature maps exhibit a higher level of redundancy, leading to insufficient feature extraction. At the same time, employing average pooling followed by classification with fully connected layers results in significant information loss. To address these issues, we apply further convolution to the fused feature maps to construct a more robust ensemble model. As illustrated in the <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>, the ensemble module is composed of 3&#xd7;3 depthwise convolution, 1&#xd7;1 standard convolution, and BN, which further extracts features based on the fused feature maps. The decision to omit the ReLU activation function is based on the fact that nonlinearity has already been introduced in each branch, and since the ensemble module is positioned deeper in the model, the use of ReLU could lead to further information loss. Other ensemble methods will be discussed in Section 3.3.1.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Layer structure and corresponding feature map shape.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" colspan="2" align="center">Layer</th>
<th valign="middle" colspan="4" align="center">Feature map shape</th>
</tr>
<tr>
<th valign="middle" align="center">shallow model 1<break/>(<italic>i</italic>=1)</th>
<th valign="middle" align="center">shallow model 2<break/>(<italic>i</italic>=2)</th>
<th valign="middle" align="center">shallow model 3<break/>(<italic>i</italic>=3)</th>
<th valign="middle" align="center">ShuffleNetV2<break/>(<italic>i</italic>=4)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" colspan="2" align="center">Input</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;3</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;3</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;3</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;3</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Conv1</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;24</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;24</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;24</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;24</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Attention Module1</td>
<td valign="middle" align="center">64&#xd7;64&#xd7;24</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Shallow<break/>Module1</td>
<td valign="middle" align="left">LCS(24,116,2)</td>
<td valign="middle" align="center">32&#xd7;32&#xd7;116</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" align="left">LCS(116,464,2)</td>
<td valign="middle" align="center">16&#xd7;16&#xd7;464</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" align="left">LCS(464,1024,2)</td>
<td valign="middle" align="center">8&#xd7;8&#xd7;1024</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Stage1</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">32&#xd7;32&#xd7;116</td>
<td valign="middle" align="center">32&#xd7;32&#xd7;116</td>
<td valign="middle" align="center">32&#xd7;32&#xd7;116</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Attention Module2</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">32&#xd7;32&#xd7;116</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="center">Shallow<break/>Module2</td>
<td valign="middle" align="left">LCS(116,464,2)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">16&#xd7;16&#xd7;464</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" align="left">LCS(464,1024,2)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">8&#xd7;8&#xd7;1024</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Stage2</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">16&#xd7;16&#xd7;232</td>
<td valign="middle" align="center">16&#xd7;16&#xd7;232</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Attention Module3</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">16&#xd7;16&#xd7;232</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="center">Shallow<break/>Module3</td>
<td valign="middle" align="left">LCS(232,464,2)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">8&#xd7;8&#xd7;464</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" align="left">LCS(464,1024,1)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">8&#xd7;8&#xd7;1024</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Stage3</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">8&#xd7;8&#xd7;464</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">Conv2</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">8&#xd7;8&#xd7;1024</td>
</tr>
<tr>
<td valign="middle" colspan="2" align="center">AvgPool <italic>i</italic>
</td>
<td valign="middle" align="center">1&#xd7;1&#xd7;1024</td>
<td valign="middle" align="center">1&#xd7;1&#xd7;1024</td>
<td valign="middle" align="center">1&#xd7;1&#xd7;1024</td>
<td valign="middle" align="center">1&#xd7;1&#xd7;1024</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Ensemble module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g008.tif"/>
</fig>
</sec>
<sec id="s2_2_7">
<label>2.2.7</label>
<title>Loss function</title>
<p>Given a training sample <italic>x</italic> and a one-hot true label <italic>y</italic>, we can get the logit output <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (where <italic>i</italic> = 1, 2, 3, 4, 5), where each <italic>z<sub>i</sub>
</italic> represents the outputs of corresponding FC Layers mentioned above, <italic>C</italic> is the number of classes. By knowledge distillation method, we can acquire the final prediction after a softmax layer:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>exp</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:mfrac>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mi>exp</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>T</italic> denotes the hyperparameter temperature, <italic>z<sub>i,j</sub>
</italic> represents the logit of the <italic>j</italic>-th class of the <italic>i</italic>-th logit, <italic>p<sub>i,j,T</sub>
</italic> represents the probability of the <italic>j</italic>-th class of the <italic>i</italic>-th logit at temperature <italic>T</italic>. In a certain range, as <italic>T</italic> increases, the model's predictions become smoother, and the inherent 'dark knowledge' (<xref ref-type="bibr" rid="B13">Hinton et al., 2015</xref>) becomes richer. When <italic>T</italic> = 1.0, the output becomes the vanilla softmax output. By introducing KL divergence loss during distillation training, the student model can learn the 'dark knowledge' from the teacher model, thereby enhancing the overall performance of the student model.</p>
<p>The proposed method employs an ensemble model as the teacher, while ShuffleNetV2 and each shallow model serve as students for distillation training. This approach incorporates three losses: <italic>L</italic>
<sub>ce</sub>, <italic>L</italic>
<sub>kd</sub>, and <italic>L</italic>
<sub>fkd</sub>. Thus, the overall objective <italic>L</italic>
<sub>total</sub> is formulated as:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>total</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>ce</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>kd</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>fkd</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>
<italic>L</italic>
<sub>ce</sub> represents the cross-entropy loss between the ensemble model, the ShuffleNetV2 model, each shallow model, and the ground truth labels of the dataset, as defined in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>. This loss function ensures that the ensemble model, the ShuffleNetV2 model, and each shallow model are all trained under the supervision of the ground truth labels from the dataset.</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>ce</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xb7;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where CE represents the cross-entropy loss function, and <inline-formula>
<mml:math display="inline" id="im2">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> is a hyperparameter to balance the weight of the student models' cross-entropy loss and the logit distillation loss.</p>
<p>
<italic>L</italic>
<sub>kd</sub> represents the KL divergence loss between the ShuffleNetV2 model, each shallow model, and the ensemble model, as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>. This loss allows the output of the ensemble model to serve as the learning target for both the ShuffleNetV2 model and the shallow model.</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>kd</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>
<italic>L</italic>
<sub>fkd</sub> represents the L2 loss calculated between the feature outputs of the ensemble model and those of shallow models 1 and 2, as shown in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>. This loss aims to align the features of shallow models 1 and 2 with those of the ensemble model, enabling the shallower models to directly learn the features of the deeper model, thereby enhancing the effectiveness of the distillation training.</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mtext>fkd</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mo>|</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>5</mml:mn>
</mml:msub>
</mml:mrow>
<mml:msubsup>
<mml:mo>||</mml:mo>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where each <italic>H<sub>i</sub>
</italic> (for <italic>i</italic> = 1, 2, 3, 4, 5) represents the outputs of corresponding AvgPool layers mentioned above, and <inline-formula>
<mml:math display="inline" id="im3">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> is a scaling factor to control the magnitude of the feature distillation loss.</p>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and discussion</title>
<p>In this section, we provide a detailed description of the experimental setup, evaluation metrics, and results. Additionally, we discuss various ensemble methods, the selection of hyperparameters, and comparisons with other models. It should be noted that, to distinguish the version integrated into the ensemble self-distillation framework from the original ShuffleNetV2, we refer to it as KD-ShuffleNetV2. The architecture of KD-ShuffleNetV2 is identical to the original ShuffleNetV2, which means that they have the same number of parameters and floating-point operations (FLOPs). The only difference is that KD-ShuffleNetV2 is optimized by the proposed distillation method to have higher recognition accuracy without changing its architecture.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Experimental setup</title>
<p>The hardware used in this experiment includes an Intel<sup>&#xae;</sup> Xeon<sup>&#xae;</sup> CPU and an Nvidia Tesla P100 16G GPU. The operating system is Linux 5.15.133+, and the required tool versions are Python 3.10.13, PyTorch 2.1.2, and CUDA 12.1. The scikit-learn library was used to calculate the evaluation indicators mentioned in Section 3.2. The ptflops library is utilized to compute the number of parameters and floating-point operations (FLOPs).</p>
<p>The parameters used in the experiment will affect the experimental results, and the values of each parameter are summarized in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The model is trained for 100 epochs with a batch size of 128. The initial learning rate is set to 0.01 and decayed by 30% every 10 epochs. The parameters are updated using the stochastic gradient descent (SGD) optimizer with a weight decay of 5&#xd7;10<sup>-4</sup> and a momentum of 0.9. The hyperparameters <inline-formula>
<mml:math display="inline" id="im4">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im5">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> are set to 0.1 and 5&#xd7;10<sup>-4</sup>, respectively. Distillation temperature <italic>T</italic> is set to 3.0.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Parameter value.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Parameter</th>
<th valign="middle" align="center">Value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Batch size</td>
<td valign="middle" align="center">128</td>
</tr>
<tr>
<td valign="middle" align="center">Image size</td>
<td valign="middle" align="center">64&#xd7;64</td>
</tr>
<tr>
<td valign="middle" align="center">Optimization algorithm</td>
<td valign="middle" align="center">SGD</td>
</tr>
<tr>
<td valign="middle" align="center">Initial learning rate</td>
<td valign="middle" align="center">0.01</td>
</tr>
<tr>
<td valign="middle" align="center">Number of epochs</td>
<td valign="middle" align="center">100</td>
</tr>
<tr>
<td valign="middle" align="center">
<inline-formula>
<mml:math display="inline" id="im6">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula>
</td>
<td valign="middle" align="center">0.1</td>
</tr>
<tr>
<td valign="middle" align="center">
<inline-formula>
<mml:math display="inline" id="im7">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula>
</td>
<td valign="middle" align="center">5&#xd7;10<sup>-4</sup></td>
</tr>
<tr>
<td valign="middle" align="center">T</td>
<td valign="middle" align="center">3.0</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Evaluation indicators</title>
<p>To evaluate the performance of the models, this paper considers the number of parameters and computational cost as the evaluation criteria for model complexity. For evaluating model performance, accuracy, precision, recall, and F1 score on the test set are used as the primary indicators. The calculation methods for these four performance metrics are shown in <xref ref-type="disp-formula" rid="eq6">Equations 6</xref>&#x2013;<xref ref-type="disp-formula" rid="eq9">9</xref>.</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mtext>Accuracy&#xa0;=&#xa0;</mml:mtext>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP+TN</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP+TN+FP+FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mtext>Recall</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> is the result of correctly predicting positive classification; <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> is the result of incorrectly predicting positive classification; <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> is the result of correctly predicting negative classification; <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> is the result of incorrectly predicting negative classification.</p>
<p>After data preprocessing, the dataset in this study exhibits a relatively balanced distribution across categories, but there are still some categories with fewer samples compared to others. In cases of class imbalance, directly using unweighted metrics can lead to evaluation results that are biased toward the categories with larger sample sizes, potentially neglecting the performance of those categories with fewer samples. To provide a more comprehensive assessment of the model's performance, we used the scikit-learn library to compute precision, recall, and F1 score in a weighted average manner.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Model performance comparison</title>
<sec id="s3_3_1" sec-type="discussion">
<label>3.3.1</label>
<title>Discussion of different ensemble modules</title>
<p>In this section, we introduce four additional ensemble methods: Avg (<xref ref-type="bibr" rid="B28">Zhang et&#xa0;al., 2018</xref>), Concat (<xref ref-type="bibr" rid="B26">Wu and Gong, 2021</xref>), Naive (<xref ref-type="bibr" rid="B6">Guo et&#xa0;al., 2020</xref>), and MinLogit (<xref ref-type="bibr" rid="B6">Guo et&#xa0;al., 2020</xref>). We only adopt the ensemble ideas from these methods and use them as benchmarks to evaluate the effectiveness of the proposed approach.</p>
<list list-type="order">
<list-item>
<p>Avg method: A widely adopted standard, this method computes the average of the logits from all branches to form the ensemble model's logits.</p>
</list-item>
<list-item>
<p>Concat method: This method concatenates the logits from all branches along the channel dimension, preserving the information from each branch. Then, a fully connected layer is applied for training ensemble model.</p>
</list-item>
<list-item>
<p>Naive method: In this method, the logit with the lowest cross-entropy loss with respect to the true label is selected from all logits across branches. This selected logit serves as the teacher for all students.</p>
</list-item>
<list-item>
<p>MinLogit method: The MinLogit method selects the minimum value at each corresponding position across the logits to form the ensemble model's logits, aiming to minimize the cross-entropy loss between the ensemble's predictions and the true labels.</p>
</list-item>
</list>
<p>To improve the clarity of our experimental results, we focus exclusively on the accuracy of KD-ShuffleNetV2 and the ensemble model on the test set. The experimental results are shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. In both the Avg and MinLogit methods, the accuracy of the ensemble model did not exhibit a significantly higher level compared to the accuracy of the KD-ShuffleNetV2, which limits the teacher's capacity to transfer generalized knowledge to the student, thereby hindering the effective improvement of the student's model performance. In the Naive and Concat methods, the accuracy of the ensemble model is significantly higher than that of KD-ShuffleNetV2, but it still fails to effectively improve the accuracy of KD-ShuffleNetV2. In contrast, compared to all other ensemble schemes, the proposed approach offers a superior ensemble model, achieving an accuracy of 95.15% on the test set. This high-performance ensemble model also effectively transfers knowledge to KD-ShuffleNetV2. As a result, the KD-ShuffleNetV2 optimized by the proposed method achieves an accuracy of 95.08%, surpassing other approaches.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Results of different ensemble methods for KD-ShuffleNetV2 and Ensemble model.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Avg</th>
<th valign="top" align="center">Concat</th>
<th valign="top" align="center">Naive</th>
<th valign="top" align="center">MinLogit</th>
<th valign="top" align="center">Ours</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">KD-ShuffleNetV2</td>
<td valign="top" align="center">94.52</td>
<td valign="top" align="center">94.59</td>
<td valign="top" align="center">94.57</td>
<td valign="top" align="center">94.63</td>
<td valign="top" align="center">95.08</td>
</tr>
<tr>
<td valign="top" align="center">Ensemble model</td>
<td valign="top" align="center">94.61</td>
<td valign="top" align="center">94.86</td>
<td valign="top" align="center">95.04</td>
<td valign="top" align="center">94.63</td>
<td valign="top" align="center">95.15</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Comparison of results for different models</title>
<p>To further verify the effectiveness of the KD-ShuffleNetV2 and other models within the ensemble self-distillation framework, this paper compared them with the Vgg16 (<xref ref-type="bibr" rid="B22">Simonyan and Zisserman, 2014</xref>), ResNet18 (<xref ref-type="bibr" rid="B7">He et&#xa0;al., 2016</xref>), MobileNetV1 (<xref ref-type="bibr" rid="B10">Howard et&#xa0;al., 2017</xref>), MobileNetV2 (<xref ref-type="bibr" rid="B21">Sandler et&#xa0;al., 2018</xref>), MobileNetV3 (<xref ref-type="bibr" rid="B9">Howard et&#xa0;al., 2019</xref>) and MobileVit (<xref ref-type="bibr" rid="B16">Mehta and Rastegari, 2021</xref>) models under the same test conditions. The experimental results are presented in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>. Compared to the original ShuffleNetV2, KD-ShuffleNetV2 achieves significant improvements in accuracy, precision, recall, and F1-score, with respective gains of 1.35%, 1.36%, 1.37%, and 1.37%; Shallow Model 3 reaches an accuracy of 95.04%, representing a 1.31% increase, while maintaining a negligible change in parameter count and reducing FLOPs by 6.01%; Shallow Model 2 achieves an accuracy of 94.30%, demonstrating a 0.57% improvement, while reducing parameters by 33.71% and FLOPs by 42.08%, making it a promising choice for resource-limited environments; Shallow Model 1 achieves an accuracy of 93.21%, with a slight decrease of 0.52%, but offers substantial reductions of 37.77% in parameters and 60.11% in FLOPs. The ensemble model achieves the highest accuracy of 95.15%, with a 1.42% improvement, making it more suitable for scenarios where accuracy is prioritized, and deployment resources are abundant.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Results of different models on the test set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Accuracy/%</th>
<th valign="middle" align="center">Precision/%</th>
<th valign="middle" align="center">Recall/%</th>
<th valign="middle" align="center">F1 score/%</th>
<th valign="middle" align="center">Params</th>
<th valign="middle" align="center">FLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Vgg16</td>
<td valign="middle" align="center">94.27</td>
<td valign="middle" align="center">93.79</td>
<td valign="middle" align="center">93.76</td>
<td valign="middle" align="center">93.75</td>
<td valign="middle" align="center">33650763</td>
<td valign="middle" align="center">1.28&#xd7;10<sup>9</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">ResNet18</td>
<td valign="middle" align="center">94.57</td>
<td valign="middle" align="center">94.14</td>
<td valign="middle" align="center">94.00</td>
<td valign="middle" align="center">94.07</td>
<td valign="middle" align="center">11271432</td>
<td valign="middle" align="center">2.23&#xd7;10<sup>9</sup>
</td>
</tr>
<tr>
<td valign="bottom" align="center">MobileNetV3_Small</td>
<td valign="bottom" align="center">92.29</td>
<td valign="bottom" align="center">91.74</td>
<td valign="bottom" align="center">91.68</td>
<td valign="bottom" align="center">91.67</td>
<td valign="bottom" align="center">1529131</td>
<td valign="bottom" align="center">5.84&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="bottom" align="center">MobileNetV3_Large</td>
<td valign="bottom" align="center">92.51</td>
<td valign="bottom" align="center">92.00</td>
<td valign="bottom" align="center">91.92</td>
<td valign="bottom" align="center">91.91</td>
<td valign="bottom" align="center">4216123</td>
<td valign="bottom" align="center">2.13&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="bottom" align="center">MobileVit_XS</td>
<td valign="bottom" align="center">93.01</td>
<td valign="bottom" align="center">92.57</td>
<td valign="bottom" align="center">92.55</td>
<td valign="bottom" align="center">92.53</td>
<td valign="bottom" align="center">2003168</td>
<td valign="bottom" align="center">5.72&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="bottom" align="center">MobileVit_S</td>
<td valign="bottom" align="center">93.03</td>
<td valign="bottom" align="center">92.47</td>
<td valign="bottom" align="center">92.41</td>
<td valign="bottom" align="center">92.40</td>
<td valign="bottom" align="center">5003760</td>
<td valign="bottom" align="center">1.10&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">MobileNetV1</td>
<td valign="middle" align="center">92.33</td>
<td valign="middle" align="center">91.92</td>
<td valign="middle" align="center">91.85</td>
<td valign="middle" align="center">91.83</td>
<td valign="middle" align="center">3224203</td>
<td valign="middle" align="center">1.93&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">MobileNetV2</td>
<td valign="middle" align="center">94.14</td>
<td valign="middle" align="center">93.65</td>
<td valign="middle" align="center">93.55</td>
<td valign="middle" align="center">93.51</td>
<td valign="middle" align="center">2255371</td>
<td valign="middle" align="center">2.28&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">ShuffleNetV2</td>
<td valign="bottom" align="center">93.73</td>
<td valign="middle" align="center">93.22</td>
<td valign="middle" align="center">93.18</td>
<td valign="middle" align="center">93.17</td>
<td valign="bottom" align="center">1269671</td>
<td valign="bottom" align="center">1.83&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">KD-ShuffleNetV2</td>
<td valign="middle" align="center">95.08</td>
<td valign="middle" align="center">94.58</td>
<td valign="middle" align="center">94.55</td>
<td valign="middle" align="center">94.54</td>
<td valign="bottom" align="center">1269671</td>
<td valign="bottom" align="center">1.83&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model3</td>
<td valign="middle" align="center">95.04</td>
<td valign="middle" align="center">94.44</td>
<td valign="middle" align="center">94.38</td>
<td valign="middle" align="center">94.38</td>
<td valign="bottom" align="center">1268890</td>
<td valign="bottom" align="center">1.72&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model2</td>
<td valign="middle" align="center">94.30</td>
<td valign="middle" align="center">93.75</td>
<td valign="middle" align="center">93.71</td>
<td valign="middle" align="center">93.70</td>
<td valign="bottom" align="center">841660</td>
<td valign="bottom" align="center">1.06&#xd7;10<sup>8</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model1</td>
<td valign="middle" align="center">93.21</td>
<td valign="middle" align="center">92.61</td>
<td valign="middle" align="center">92.50</td>
<td valign="middle" align="center">92.49</td>
<td valign="bottom" align="center">790090</td>
<td valign="bottom" align="center">7.31&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">Ensemble model</td>
<td valign="middle" align="center">95.15</td>
<td valign="middle" align="center">94.65</td>
<td valign="middle" align="center">94.63</td>
<td valign="middle" align="center">94.61</td>
<td valign="bottom" align="center">4936699</td>
<td valign="bottom" align="center">4.27&#xd7;10<sup>8</sup>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Compared to VGG16, the ensemble model and KD-ShuffleNetV2 improve accuracy by 0.88% and 0.81%, respectively, while reducing parameters by 85.33% and 96.23%, and FLOPs by 66.64% and 85.70%, respectively. Similarly, compared to ResNet18, the ensemble model and KD-ShuffleNetV2 achieve accuracy improvements of 0.58% and 0.51%, respectively, while reducing parameters by 56.20% and 88.74%, and FLOPs by 80.85% and 91.79%.</p>
<p>When compared to other listed lightweight models, KD-ShuffleNetV2 and shallow models demonstrate superior accuracy with relatively lower parameter counts. Specifically, compared to MobileNetV2, which achieves the highest accuracy among the listed lightweight models, KD-ShuffleNetV2, Shallow Model 3, and Shallow Model 2 achieve accuracy improvements of 0.94%, 0.90%, and 0.16%, respectively, while reducing parameter counts by 43.70%, 43.73%, and 62.68%, and FLOPs by 19.74%, 24.56%, and 53.51%, respectively. Compared to MobileViT_S, which has the largest parameter count among the lightweight models, KD-ShuffleNetV2 reduces parameters by 74.63% while improving accuracy by 2.05%. Similarly, Shallow Model 1 reduces parameters by 84.21% and improves accuracy by 0.18%.</p>
<p>The classification results, as visualized in the confusion matrix in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>, demonstrate the performance of the model. The vertical axis represents the 11 categories of tomato leaf diseases in the dataset, while the horizontal axis corresponds to the categories predicted by the model. Compared to the original ShuffleNetV2, except for the healthy category, KD-ShuffleNetV2 achieved a notable improvement in the number of correctly predicted samples across all categories. The most significant enhancement was observed in the late blight category, where an additional 13 samples were accurately classified.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Confusion matrices of different models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g009.tif"/>
</fig>
</sec>
</sec>
<sec id="s3_4" sec-type="discussion">
<label>3.4</label>
<title>Discussion on data augmentation</title>
<p>To investigate the impact of data augmentation on the generalization ability of models, we conducted experiments to evaluate the accuracy of each model on the test set without data augmentation and compared them with those using data augmentation. The experimental results are shown in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. It can be observed that after data augmentation, the accuracy of all models on the test set improved. Without data augmentation, although the accuracy of KD-ShuffleNetV2 does not reach the level of larger models such as Vgg16 and ResNet18, it still surpasses all the lightweight models listed. Specifically, the accuracy of KD-ShuffleNetV2 (92.15%) is 1.44% higher than that of MobileNetV2 (90.71%). Additionally, compared to the original ShuffleNetV2, the accuracy of KD-ShuffleNetV2 improved by 3.29% without data augmentation and by 1.35% with data augmentation, demonstrating that the proposed method effectively enhances model performance regardless of the use of data augmentation.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>The impact of data augmentation on experimental results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Vgg16</th>
<th valign="middle" align="center">ResNet18</th>
<th valign="middle" align="center">MobileNetV3_Large</th>
<th valign="middle" align="center">MobileVit_S</th>
<th valign="middle" align="center">MobileNet<break/>V2</th>
<th valign="middle" align="center">ShuffleNet<break/>V2</th>
<th valign="middle" align="center">KD-ShuffleNetV2</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">without<break/>data augment</td>
<td valign="middle" align="center">93.12</td>
<td valign="middle" align="center">92.92</td>
<td valign="middle" align="center">89.63</td>
<td valign="middle" align="center">87.62</td>
<td valign="middle" align="center">90.71</td>
<td valign="middle" align="center">88.86</td>
<td valign="middle" align="center">92.15</td>
</tr>
<tr>
<td valign="middle" align="center">with<break/>data augment</td>
<td valign="middle" align="center">94.27</td>
<td valign="middle" align="center">94.57</td>
<td valign="middle" align="center">92.51</td>
<td valign="middle" align="center">93.03</td>
<td valign="middle" align="center">94.14</td>
<td valign="middle" align="center">93.73</td>
<td valign="middle" align="center">95.08</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_5" sec-type="discussion">
<label>3.5</label>
<title>Discussion on the hyperparameter</title>
<p>To verify the effectiveness of logit distillation and feature distillation, we conducted experiments with different values of <inline-formula>
<mml:math display="inline" id="im12">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im13">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> at a temperature of 3.0. First, by fixing the initial value of <inline-formula>
<mml:math display="inline" id="im14">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> at 1&#xd7;10<sup>-4</sup>, we explored the impact of different <inline-formula>
<mml:math display="inline" id="im15">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> on the experimental results, as shown in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>. As <inline-formula>
<mml:math display="inline" id="im16">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> increased, the accuracy of both KD-ShuffleNetV2 and the ensemble model reached its peak at <inline-formula>
<mml:math display="inline" id="im17">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> = 0.1. Other models reached their peak slightly later, but also exhibited an increasing trend followed by a decrease. Subsequently, we fixed <inline-formula>
<mml:math display="inline" id="im18">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> at 0.1 and investigated the impact of different <inline-formula>
<mml:math display="inline" id="im19">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> on the experimental results. As shown in <xref ref-type="table" rid="T8">
<bold>Table&#xa0;8</bold>
</xref>, as <inline-formula>
<mml:math display="inline" id="im20">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> increased, there was no clear trend in the accuracy of the models. However, except for the shallow model 2, the accuracy of all other models reached peak accuracy at <inline-formula>
<mml:math display="inline" id="im21">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> = 5&#xd7;10<sup>-4</sup>.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparison of models' accuracy under varying <inline-formula>
<mml:math display="inline" id="im22">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> values with fixed <inline-formula>
<mml:math display="inline" id="im23">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> =1&#xd7;10<sup>-4</sup>.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Model</th>
<th valign="top" colspan="6" align="center">
<inline-formula>
<mml:math display="inline" id="im24">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th valign="top" align="center">0</th>
<th valign="middle" align="center">0.1</th>
<th valign="middle" align="center">0.2</th>
<th valign="middle" align="center">0.3</th>
<th valign="middle" align="center">0.4</th>
<th valign="middle" align="center">0.5</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">KD-ShuffleNetV2</td>
<td valign="top" align="center">94.63</td>
<td valign="top" align="center">94.90</td>
<td valign="top" align="center">94.86</td>
<td valign="top" align="center">94.61</td>
<td valign="top" align="center">94.63</td>
<td valign="top" align="center">94.50</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model3</td>
<td valign="top" align="center">94.45</td>
<td valign="top" align="center">94.86</td>
<td valign="top" align="center">94.81</td>
<td valign="top" align="center">94.63</td>
<td valign="top" align="center">94.50</td>
<td valign="top" align="center">94.36</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model2</td>
<td valign="top" align="center">94.36</td>
<td valign="top" align="center">94.30</td>
<td valign="top" align="center">94.43</td>
<td valign="top" align="center">94.32</td>
<td valign="top" align="center">94.18</td>
<td valign="top" align="center">93.98</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model1</td>
<td valign="top" align="center">92.69</td>
<td valign="top" align="center">92.69</td>
<td valign="top" align="center">92.76</td>
<td valign="top" align="center">93.08</td>
<td valign="top" align="center">92.58</td>
<td valign="top" align="center">92.27</td>
</tr>
<tr>
<td valign="middle" align="center">Ensemble model</td>
<td valign="top" align="center">94.93</td>
<td valign="top" align="center">94.97</td>
<td valign="top" align="center">95.04</td>
<td valign="top" align="center">94.81</td>
<td valign="top" align="center">94.84</td>
<td valign="top" align="center">94.68</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Comparison of models' accuracy under varying <inline-formula>
<mml:math display="inline" id="im25">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> values with fixed <inline-formula>
<mml:math display="inline" id="im26">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> =0.1.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Model</th>
<th valign="top" colspan="6" align="center">
<inline-formula>
<mml:math display="inline" id="im27">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th valign="top" align="center">0</th>
<th valign="middle" align="center">1&#xd7;10<sup>-4</sup>
</th>
<th valign="middle" align="center">3&#xd7;10<sup>-4</sup>
</th>
<th valign="middle" align="center">5&#xd7;10<sup>-4</sup>
</th>
<th valign="middle" align="center">7&#xd7;10<sup>-4</sup>
</th>
<th valign="middle" align="center">9&#xd7;10<sup>-4</sup>
</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">KD-ShuffleNetV2</td>
<td valign="top" align="center">94.72</td>
<td valign="top" align="center">94.90</td>
<td valign="top" align="center">94.66</td>
<td valign="top" align="center">95.08</td>
<td valign="top" align="center">94.66</td>
<td valign="top" align="center">94.45</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model3</td>
<td valign="top" align="center">94.72</td>
<td valign="top" align="center">94.86</td>
<td valign="top" align="center">94.72</td>
<td valign="top" align="center">95.04</td>
<td valign="top" align="center">94.79</td>
<td valign="top" align="center">94.54</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model2</td>
<td valign="top" align="center">94.25</td>
<td valign="top" align="center">94.30</td>
<td valign="top" align="center">94.50</td>
<td valign="top" align="center">94.30</td>
<td valign="top" align="center">94.48</td>
<td valign="top" align="center">94.30</td>
</tr>
<tr>
<td valign="middle" align="center">Shallow model1</td>
<td valign="top" align="center">92.99</td>
<td valign="top" align="center">92.69</td>
<td valign="top" align="center">93.12</td>
<td valign="top" align="center">93.21</td>
<td valign="top" align="center">93.08</td>
<td valign="top" align="center">93.19</td>
</tr>
<tr>
<td valign="middle" align="center">Ensemble model</td>
<td valign="top" align="center">95.08</td>
<td valign="top" align="center">94.97</td>
<td valign="top" align="center">95.04</td>
<td valign="top" align="center">95.15</td>
<td valign="top" align="center">94.84</td>
<td valign="top" align="center">94.88</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>When <inline-formula>
<mml:math display="inline" id="im28">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> = 0, the experiment corresponded to training without logit distillation, leading to a 0.45% decrease in accuracy compared to the optimal case with logit distillation for KD-ShuffleNetV2. Similarly, when <inline-formula>
<mml:math display="inline" id="im29">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> = 0, the experiment corresponded to training without feature distillation, resulting in a 0.36% accuracy drop compared to the optimal case with feature distillation for KD-ShuffleNetV2. This demonstrates that both distillation methods effectively transfer knowledge from the teacher model, and their combined use is essential for maintaining or enhancing the model's generalization ability.</p>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Visualization of experimental results</title>
<p>The three shallow models in the ensemble self-distillation framework are built based on KD-ShuffleNetV2. KD-ShuffleNetV2 along with each shallow model can be viewed as different branches of the overall structure. To investigate the regions each branch focuses on during training, Grad-CAM is applied to visualize the heatmap for each branch's attention to the input data. As shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>, it is easy to observe that, from the first to the third shallow model, the regions highlighted by Grad-CAM gradually expand, and the areas of focus for each branch differ significantly. This diversity in attention regions may contribute to the improvement of the ensemble model's performance. The ensemble model integrates information from all branches, resulting in a broader area of attention. Therefore, the ensemble model, as the teacher model, can provide more generalized knowledge to each branch. By analyzing the heatmaps of the powdery mildew samples, it can be observed that KD-ShuffleNetV2 accurately focuses on the disease-affected regions of the leaves, even though they are dispersed across different locations.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Heatmaps of all branches within the distillation framework.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1521008-g010.tif"/>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusions</title>
<p>In order to address the challenge of deploying high-precision large models on edge devices for real-time tomato disease detection, we propose a method based on ensemble self-distillation. This method successfully improves the accuracy of KD-ShuffleNetV2, achieving not only the lowest parameter count among all the listed lightweight models but also the highest accuracy. Furthermore, its accuracy surpasses that of larger models like VGG16 and ResNet18, demonstrating the successful transfer of knowledge from the large model to the small model. The entire training process requires only one stage, significantly reducing the training cost compared to the two stages required by traditional knowledge distillation methods. In terms of creating the ensemble model, our proposed ensemble method effectively transfers knowledge to the student model, outperforming other methods, such as averaging logits. Moreover, heatmap results show that the multiple shallow models used to assist online knowledge distillation, as well as the KD-ShuffleNetV2, focus on different regions of the tomato leaf disease, enhancing the diversity of the branches and contributing to the improved performance of the ensemble model. Additionally, the multiple shallow models achieve varying levels of compression of the original ShuffleNetV2. Compared to the original ShuffleNetV2, shallow model 2 improves accuracy by 0.57%, while reducing the parameter count and FLOPs by 33.71% and 42.08%, respectively. Despite these promising results, there are some limitations to our current framework. Although the proposed approach can be applied to any deep learning model, it requires customization for each specific model, which is often a complex process. Moreover, when applied to CNN models integrated with Transformers, such as MobileNetV3 and MobileVit, it often yields suboptimal results.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <uri xlink:href="https://github.com/AIChallenger/AI_Challenger_2018">https://github.com/AIChallenger/AI_Challenger_2018</uri>; <uri xlink:href="https://github.com/spMohanty/PlantVillage-Dataset">https://github.com/spMohanty/PlantVillage-Dataset</uri>; <uri xlink:href="https://doi.org/10.17632/ngdgg79rzb.1">https://doi.org/10.17632/ngdgg79rzb.1</uri>; <uri xlink:href="https://doi.org/10.1145/3371158.3371196">https://doi.org/10.1145/3371158.3371196</uri>.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>SN: Conceptualization, Methodology, Writing &#x2013; original draft. YJ: Data curation, Formal analysis, Software, Writing &#x2013; original draft. MZ: Funding acquisition, Writing &#x2013; review &amp; editing. YZ: Data curation, Project administration, Writing &#x2013; review &amp; editing. WW: Validation, Visualization, Writing &#x2013; review &amp; editing. SL: Supervision, Writing &#x2013; review &amp; editing. YC: Data curation, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. Key Research Project of Henan Province (231111210500).</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>Authors MZ, SL, and YC were employed by Henan Chuitian Technology Corporation Limited.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Mei</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Knowledge distillation with the reused teacher classifier</article-title>,&#x201d; in <conf-name>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>New Orleans, LA, USA</conf-loc>. <fpage>11923</fpage>&#x2013;<lpage>11932</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choudhary</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Mishra</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Goswami</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Sarangapani</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A comprehensive survey on model compression and acceleration</article-title>. <source>Artif. Intell. Rev.</source> <volume>53</volume>, <fpage>5113</fpage>&#x2013;<lpage>5155</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-020-09816-7</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="web">
<person-group person-group-type="author">
<collab>Dataset AI Challenger</collab>
</person-group> (<year>2018</year>). <source>AI Challenger 2018 Datasets</source>. Available online at: <uri xlink:href="https://github.com/AIChallenger/AI_Challenger_2018">https://github.com/AIChallenger/AI_Challenger_2018</uri> (Accessed <access-date>18 December 2024</access-date>).</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Edna Chebet</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sam</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A comparative study of fine-tuning deep learning models for plant disease identification</article-title>. <source>Comput. Electron. Agric.</source> <volume>161</volume>, <fpage>272</fpage>&#x2013;<lpage>279</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.03.032</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gong</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Qiao</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Adaptive hierarchy-branch fusion for online knowledge distillation</article-title>,&#x201d; in <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>, Vol. <volume>37</volume>.</citation>
</ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Guo</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). &#x201c;<article-title>Online knowledge distillation via collaborative learning</article-title>,&#x201d; in <conf-name>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Seattle, WA, USA</conf-loc>. <fpage>11017</fpage>&#x2013;<lpage>11026</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Las Vegas, NV, USA</conf-loc>. <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Hinton</surname> <given-names>G. E.</given-names>
</name>
<name>
<surname>Vinyals</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Dean</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Distilling the knowledge in a neural network</source>. Available online at: <uri xlink:href="https://arxiv.org/abs/1503.02531">https://arxiv.org/abs/1503.02531</uri> (Accessed <access-date>25 December 2024</access-date>).</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Howard</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L. C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>M. X.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;<article-title>Searching for mobilenetv3</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, Vol. <volume>2019</volume>. <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Howard</surname> <given-names>A. G.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Kalenichenko</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Weyand</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). &#x201c;<article-title>Mobilenets: Efficient convolutional neural networks for mobile vision applications</article-title>,&#x201d; in <conf-name>The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Seoul, Korea (South)</conf-name> (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>M.-L.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>Y.-H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Dataset of tomato leaves</article-title>. <source>Mendeley Data</source> <volume>1</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.17632/ngdgg79rzb.1</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A ResNet50-DPA model for tomato leaf disease identification</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1258658</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Tomato leaf disease recognition based on multi-task distillation learning</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1330527</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>H. T.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Shufflenet v2: Practical guidelines for efficient cnn architecture design</article-title>,&#x201d; in <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>. <fpage>116</fpage>&#x2013;<lpage>131</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Meenakshi</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Swaraja</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Ch</surname> <given-names>U. K.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Grading of quality in tomatoes using multi-class svm</article-title>,&#x201d; in <conf-name>2019 3rd International Conference on Computing Methodologies and Communication (ICCMC)</conf-name>. <fpage>104</fpage>&#x2013;<lpage>107</lpage> (<publisher-loc>Erode, India</publisher-loc>: <publisher-name>Surya Engineering College, IEEE</publisher-name>).</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mehta</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Rastegari</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer</article-title>. <source>arXiv preprint arXiv:2110.02178</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2110.02178</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pandiyaraju</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>A. M. S.</given-names>
</name>
<name>
<surname>Praveen</surname> <given-names>J. I. R.</given-names>
</name>
<name>
<surname>Venkatraman</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>S. P.</given-names>
</name>
<name>
<surname>Aravintakshan</surname> <given-names>S. A.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Improved tomato leaf disease classification through adaptive ensemble models with exponential moving average fusion and enhanced weighted gradient optimization</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1382416</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Patil</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Yaligar</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Meena</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Comparision of performance of classifiers-svm, rf and ann in potato blight disease detection using leaf images</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Computational Intelligence and Computing research (ICCIC) Tamil Nadu, India</conf-name>, <conf-loc>Piscataway, NJ</conf-loc>. <fpage>1</fpage>&#x2013;<lpage>5</lpage> (<publisher-name>IEEE</publisher-name>).</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qin</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>D. X.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>B. D.</given-names>
</name>
<name>
<surname>Ruan</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Z. H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H. G.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Identification of alfalfa leaf diseases using image recognition technology</article-title>. <source>PloS One</source> <volume>11</volume>, <elocation-id>e0168274</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0168274</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rangarajan</surname> <given-names>A. K.</given-names>
</name>
<name>
<surname>Purushothaman</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Ramesh</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Tomato crop disease classification using pre-trained deep learning algorithm</article-title>. <source>Proc. Comput. Sci.</source> <volume>133</volume>, <fpage>1040</fpage>&#x2013;<lpage>1047</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.procs.2018.07.070</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L. C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>MobileNetV2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE computer society conference on computer vision and pattern recognition, USA</conf-name>. <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Simonyan</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zisserman</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). <source>Very deep convolutional networks for large-scale image recognition</source>. Available online at: <uri xlink:href="https://arxiv.org/abs/1409.1556">https://arxiv.org/abs/1409.1556</uri> (Accessed <access-date>25 December 2024</access-date>).</citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Singh</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Kayal</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Kumawat</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Batra</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Plantdoc: A dataset for visual plant disease detection</article-title>,&#x201d; in <conf-name>Proc. 7th ACM IKDD CoDS 25th COMAD</conf-name>. <fpage>249</fpage>&#x2013;<lpage>253</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3371158.3371196</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="web">
<person-group person-group-type="author">
<collab>SpMohanty</collab>
</person-group> (<year>2018</year>). <source>PlantVillage-dataset</source>. Available online at: <uri xlink:href="https://github.com/spMohanty/PlantVillage-Dataset">https://github.com/spMohanty/PlantVillage-Dataset</uri> (Accessed <access-date>18 December 2024</access-date>).</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Tomato leaf disease detection based on attention mechanism and multi-scale feature fusion</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1382802</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Gong</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Peer collaborative learning for online knowledge distillation</article-title>,&#x201d; in <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>, Vol. <volume>35</volume>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Bao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Self-Distillation: Towards efficient and compact neural networks</article-title>,&#x201d; in <source>IEEE Transactions on Pattern Analysis and Machine Intelligence</source>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4388</fpage>&#x2013;<lpage>4403</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>ShuffleNet: An extremely efficient convolutional neural network for mobile devices</article-title>,&#x201d; in <conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Salt Lake City, UT, USA</conf-loc>. <fpage>6848</fpage>&#x2013;<lpage>6856</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Tomato leaf disease diagnosis based on improved convolution neural network by attention module</article-title>. <source>Agriculture</source> <volume>11</volume>, <elocation-id>651</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture11070651</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xing</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Tomato leaf disease identification by restructured deep residual dense network</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>28822</fpage>&#x2013;<lpage>28831</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2021.3058947</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Niu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Qin</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Identification of leaf diseases in field crops based on improved ShuffleNetV2</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1342123</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>