<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="methods-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2023.1256773</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>PMVT: a lightweight vision transformer for plant disease identification on mobile devices</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Guoqiang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2374944"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Yuchao</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2370041"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Qing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yuan</surname>
<given-names>Peiyan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chang</surname>
<given-names>Baofang</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2372914"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Institute of Agricultural Economics and Information, Henan Academy of Agricultural Sciences</institution>, <addr-line>Zhengzhou, Henan</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>College of Computer and Information Engineering, Henan Normal University</institution>, <addr-line>Xinxiang, Henan</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Key Laboratory of Artificial Intelligence and Personalized Learning in Education of Henan Province</institution>, <addr-line>Xinxiang, Henan</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Ce Yang, University of Minnesota Twin Cities, United States</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Congliang Zhou, University of Florida, United States; Lingxian Zhang, China Agricultural University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Baofang Chang, <email xlink:href="mailto:changbaofang@htu.edu.cn">changbaofang@htu.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>09</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1256773</elocation-id>
<history>
<date date-type="received">
<day>11</day>
<month>07</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>09</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Li, Wang, Zhao, Yuan and Chang</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Li, Wang, Zhao, Yuan and Chang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Due to the constraints of agricultural computing resources and the diversity of plant diseases, it is challenging to achieve the desired accuracy rate while keeping the network lightweight. In this paper, we proposed a computationally efficient deep learning architecture based on the mobile vision transformer (MobileViT) for real-time detection of plant diseases, which we called plant-based MobileViT (PMVT). Our proposed model was designed to be highly accurate and low-cost, making it suitable for deployment on mobile devices with limited resources. Specifically, we replaced the convolution block in MobileViT with an inverted residual structure that employs a 7&#xd7;7 convolution kernel to effectively model long-distance dependencies between different leaves in plant disease images. Furthermore, inspired by the concept of multi-level attention in computer vision tasks, we integrated a convolutional block attention module (CBAM) into the standard ViT encoder. This integration allows the network to effectively avoid irrelevant information and focus on essential features. The PMVT network achieves reduced parameter counts compared to alternative networks on various mobile devices while maintaining high accuracy across different vision tasks. Extensive experiments on multiple agricultural datasets, including wheat, coffee, and rice, demonstrate that the proposed method outperforms the current best lightweight and heavyweight models. On the wheat dataset, PMVT achieves the highest accuracy of 93.6% using approximately 0.98 million (M) parameters. This accuracy is 1.6% higher than that of MobileNetV3. Under the same parameters, PMVT achieved an accuracy of 85.4% on the coffee dataset, surpassing SqueezeNet by 2.3%. Furthermore, our method achieved an accuracy of 93.1% on the rice dataset, surpassing MobileNetV3 by 3.4%.
Additionally, we developed a plant disease diagnosis app and successfully used the trained PMVT model to identify plant disease in different scenarios.</p>
</abstract>
<kwd-group>
<kwd>plant disease identification</kwd>
<kwd>vision transformer</kwd>
<kwd>lightweight model</kwd>
<kwd>attention module</kwd>
<kwd>APP</kwd>
</kwd-group>
<counts>
<fig-count count="12"/>
<table-count count="4"/>
<equation-count count="6"/>
<ref-count count="33"/>
<page-count count="12"/>
<word-count count="4992"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Plant disease is one of the contributing factors to the global decrease in grain production (<xref ref-type="bibr" rid="B21">Savary et&#xa0;al., 2019</xref>), and real-time detection of plant disease has an important impact on the agricultural industry. Applying deep learning models significantly simplifies the entire process and enables end-to-end technical services. Currently, there are two typical architectures for plant disease recognition: convolutional neural network (CNN)-based architectures and vision transformer (ViT)-based architectures. These methods extract explicit features from images and automatically perform classification, which is key for plant disease recognition.</p>
<p>Over the past few years, the application of CNNs to identifying plant diseases has gained in popularity with the development of artificial intelligence technology. For instance, <xref ref-type="bibr" rid="B2">Akshai and Anitha (2021)</xref> compared various CNNs using the PlantVillage dataset (<xref ref-type="bibr" rid="B10">Hughes and Salathe, 2015</xref>) and reported that the DenseNet model with feature map reuse achieved the highest accuracy of 98.27%. Another study by <xref ref-type="bibr" rid="B31">Yu et&#xa0;al. (2022a)</xref> used a ResNet network with a residual structure to identify apple leaf diseases, and it obtained an average F1-score of 95.70%. CNNs can efficiently extract significant features from images and accomplish plant disease identification automatically. The primary reason for this is that CNNs have the characteristic of parameter sharing, which reduces the number of parameters in the model and addresses the overfitting issue seen in computer vision tasks. Therefore, the application of deep learning technology based on CNNs has made significant progress in plant disease diagnosis (<xref ref-type="bibr" rid="B7">Hasan et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B30">Xiong et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B1">Ahmad et&#xa0;al., 2022</xref>). Nonetheless, there will be an increase in unnecessary computational overhead as a network&#x2019;s depth increases. Simultaneously, the convolutional layer of CNNs only considers the characteristics of the local area during convolution and does not explicitly incorporate the positional information of pixels. This will impact the effectiveness of a plant disease identification model.</p>
<p>To address the above issues, <xref ref-type="bibr" rid="B5">Dosovitskiy et&#xa0;al. (2020)</xref> proposed a vision transformer (ViT) architecture based on a self-attention mechanism (<xref ref-type="bibr" rid="B27">Vaswani et&#xa0;al., 2017</xref>) to replace the traditional CNN for image recognition. A ViT architecture divides an image into non-overlapping patches and applies multi-head self-attention within the transformer encoder to learn representations of patches. Although this paradigm considers the global relationship of images and has achieved satisfactory results in plant disease recognition, it usually requires a large quantity of training data to achieve relatively high accuracy. Hence, alternating the use of CNNs and ViTs to extract more comprehensive features has become a better choice in plant disease diagnosis. Take a classic case: <xref ref-type="bibr" rid="B17">Lu et&#xa0;al. (2022)</xref> introduced a ghost module into the ViT encoder, which extracts different levels of features in an image. Their model achieved an accuracy rate of 98.14% in detecting grape leaf diseases and insect pests in the field. Similarly, <xref ref-type="bibr" rid="B33">Yu et&#xa0;al. (2023)</xref> used inception blocks to enhance the ability of the ViT encoder to extract local information; they achieved optimal performance on four typical plant disease datasets. As an alternative architectural paradigm to CNNs, the ViT has attracted significant attention and achieved considerable success in the field of computer vision (<xref ref-type="bibr" rid="B12">Khan et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B14">Lin et&#xa0;al., 2022</xref>).</p>
<p>With the significant advancements of CNNs and ViT networks in plant disease recognition technology, a prevailing trend among network models is to augment the number of parameters in order to enhance performance. These enhancements in performance are accompanied by an increase in model size (network parameters) and latency (<xref ref-type="bibr" rid="B6">Han et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B29">Wu et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B32">Yu et&#xa0;al., 2022b</xref>). They overlook a common issue: plant disease identification is typically conducted on edge devices, such as smartphones and embedded devices. Such devices usually have restricted computing power, storage capacity, and energy supply. Hence, using a lightweight network can decrease the size and computational complexity of the model, thereby improving its compatibility with resource constraints. Numerous researchers have recently been studying the application of affordable network models for real-time plant disease detection. Concretely, <xref ref-type="bibr" rid="B3">Bao et&#xa0;al. (2021)</xref> proposed SimpleNet, which achieved 94.10% wheat recognition accuracy with only 2.13 million (M) parameters. In addition, the apple leaf disease identification method based on the cascade backbone network (CBNet) proposed by <xref ref-type="bibr" rid="B23">Sheng et&#xa0;al. (2022)</xref> achieved an accuracy rate of 96.76%. Moreover, the VGG-ICNN model proposed by <xref ref-type="bibr" rid="B25">Thakur et&#xa0;al. (2023)</xref> has 6 M parameters, which is lower than most deep learning models; and it performs well on multiple datasets such as apple, corn, and rice. Generally, the methods mentioned above primarily concentrate on identifying a single plant disease, while other methods exhibit imbalances in identification accuracy and calculation cost.
Hence, to enhance the real-time performance of plant disease identification, it is crucial to employ a low-latency and highly accurate network model.</p>
<p>Achieving high-accuracy and low-cost plant disease identification in agricultural environments with limited computing resources presents a significant challenge. The majority of existing lightweight networks focus on a single plant disease. However, when faced with numerous types of plant diseases, they fail to deliver satisfactory performance. In this paper, we introduced a lightweight model for plant disease diagnosis based on MobileViT (<xref ref-type="bibr" rid="B19">Mehta and Rastegari, 2021</xref>), which has a low computational cost and is competitive in terms of inference speed. In particular, the crisscrossing leaves in the agricultural dataset lead to an unsatisfactory recognition effect with MobileViT. Thus, we consider using a larger convolution kernel (7 &#xd7; 7) to analyze the connection between different leaves. Using larger convolution kernels allows us to model the dependencies between long-distance pixels (<xref ref-type="bibr" rid="B15">Liu et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B16">Liu et&#xa0;al., 2022</xref>) and enhance the ability of the model to capture global information from plant disease images. Additionally, focusing on the salient leaf regions in plant images can improve the robustness of the model. We used the CBAM (<xref ref-type="bibr" rid="B28">Woo et&#xa0;al., 2018</xref>) to adjust feature weights in various channels of the transformer encoder. Finally, we employed a residual network to fuse the initial feature map and improve the fitting ability of the model. We named this model plant-based MobileViT (PMVT) and deployed it to identify plant diseases in datasets and in various scenarios. Experimental results indicate that PMVT surpasses the current leading lightweight networks and heavyweight models, thereby demonstrating its effectiveness as a versatile backbone network across various datasets.</p>
<p>The main contributions of this paper are as follows.</p>
<list list-type="bullet">
<list-item>
<p>We used a low-cost ViT model for plant disease diagnosis. This model is computationally efficient and can function as a generic backbone network on mobile devices.</p>
</list-item>
<list-item>
<p>We introduced a 7 &#xd7; 7-sized convolution kernel into the convolution block for modeling long-distance pixel-to-pixel dependencies. Moreover, the CBAM guides the network to learn the weights between various channels, which enhances the fitting ability of MobileViT to image feature representation.</p>
</list-item>
<list-item>
<p>We conducted comparative experiments on several datasets obtained under different scenarios, and the results revealed that our method not only competes with similarly sized lightweight networks but also outperforms state-of-the-art heavyweight networks.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Datasets</title>
<p>We randomly divided three datasets into a training set, validation set, and testing set according to the ratio of 8:1:1. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> shows the details of each dataset and how many samples comprised each subset. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> displays some samples of the datasets.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Data distributions for the datasets used in our comparative experiments.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Name</th>
<th valign="top" align="center">Class</th>
<th valign="top" align="left">Diseases</th>
<th valign="top" align="center">Training set size</th>
<th valign="top" align="center">Validation set size</th>
<th valign="top" align="center">Testing set size</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="7" align="left">Wheat</td>
<td valign="top" align="center">0</td>
<td valign="top" align="left">healthy</td>
<td valign="top" align="center">528</td>
<td valign="top" align="center">65</td>
<td valign="top" align="center">59</td>
</tr>
<tr>
<td valign="top" align="center">1</td>
<td valign="top" align="left">rust</td>
<td valign="top" align="center">673</td>
<td valign="top" align="center">83</td>
<td valign="top" align="center">77</td>
</tr>
<tr>
<td valign="top" align="center">2</td>
<td valign="top" align="left">mildew</td>
<td valign="top" align="center">282</td>
<td valign="top" align="center">34</td>
<td valign="top" align="center">32</td>
</tr>
<tr>
<td valign="top" align="center">3</td>
<td valign="top" align="left">smut</td>
<td valign="top" align="center">674</td>
<td valign="top" align="center">83</td>
<td valign="top" align="center">75</td>
</tr>
<tr>
<td valign="top" align="center">4</td>
<td valign="top" align="left">root rot</td>
<td valign="top" align="center">381</td>
<td valign="top" align="center">46</td>
<td valign="top" align="center">41</td>
</tr>
<tr>
<td valign="top" align="center">5</td>
<td valign="top" align="left">scab</td>
<td valign="top" align="center">391</td>
<td valign="top" align="center">48</td>
<td valign="top" align="center">45</td>
</tr>
<tr>
<td valign="top" align="center">6</td>
<td valign="top" align="left">leaf spot</td>
<td valign="top" align="center">378</td>
<td valign="top" align="center">47</td>
<td valign="top" align="center">45</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">Coffee</td>
<td valign="top" align="center">0</td>
<td valign="top" align="left">healthy</td>
<td valign="top" align="center">353</td>
<td valign="top" align="center">43</td>
<td valign="top" align="center">39</td>
</tr>
<tr>
<td valign="top" align="center">1</td>
<td valign="top" align="left">red spider mite</td>
<td valign="top" align="center">136</td>
<td valign="top" align="center">16</td>
<td valign="top" align="center">15</td>
</tr>
<tr>
<td valign="top" align="center">2</td>
<td valign="top" align="left">rust</td>
<td valign="top" align="center">324</td>
<td valign="top" align="center">39</td>
<td valign="top" align="center">35</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">Rice</td>
<td valign="top" align="center">0</td>
<td valign="top" align="left">healthy</td>
<td valign="top" align="center">407</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">44</td>
</tr>
<tr>
<td valign="top" align="center">1</td>
<td valign="top" align="left">unhealthy</td>
<td valign="top" align="center">413</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">43</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Sample images from the <bold>(A)</bold> wheat dataset, <bold>(B)</bold> coffee dataset, and <bold>(C)</bold> rice dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g001.tif"/>
</fig>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Wheat</title>
<p>The wheat (<xref ref-type="bibr" rid="B13">Lian, 2022</xref>) dataset comprises 4087 images of varying sizes depicting seven different categories of wheat diseases. The images include the real-world environmental factors that interfere with identifying the wheat crop, such as sky, soil, and weeds.</p>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Coffee</title>
<p>The coffee (<xref ref-type="bibr" rid="B20">Parraga-Alava et&#xa0;al., 2019</xref>) dataset contains three types of coffee leaves: healthy, red spider mite, and rust. Images of the same size and resolution are included in each category of leaves. The dataset was collected in a natural field environment, where the background of the pictures contains various disturbances such as weeds and soil. Since some sample features are not significant enough, we selected a thousand of them to build a new dataset.</p>
</sec>
<sec id="s2_1_3">
<label>2.1.3</label>
<title>Rice</title>
<p>The rice (<xref ref-type="bibr" rid="B22">Sethy, 2020</xref>) dataset lends itself to the classical binary classification problem as it contains samples classified simply as either healthy or unhealthy rice. The resolution of the images in this dataset varies in size. Furthermore, some of the images in this dataset have a uniform white background, which makes the dataset ideal for testing model performance in both a controlled laboratory environment and a real field environment.</p>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Our proposed method</title>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>Overall structure of PMVT</title>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> depicts the overall structure of our model, which comprises five layers. Before pushing input into the block, the feature map is downsampled using a 3 &#xd7; 3 convolution; this is followed by an inverted residual block or a standard transformer encoder. The inverted residual block is used to extract local features of the image and capture the long-distance dependencies between distant pixels. The MobileViT block uses a self-attention mechanism to model the global relationship of the image and employs a CBAM block to make up the channel attention and spatial attention information. The channel dimension is expanded by four times using a 1 &#xd7; 1 convolution in the last layer of the network to better adapt to computer classification tasks. PMVT contains three different network sizes: extra extra small (XXS), extra small (XS), and small (S). These sizes correspond to those in MobileViT.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Overview of the PMVT model. &#x2193;2 means to downsample the feature map twice, and L stands for repeated stacking of L MobileViT blocks. For computer vision classification tasks, we use a classifier composed of an average pooling layer and a fully connected layer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g002.tif"/>
</fig>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Inverted residual block</title>
<p>An inverted residual block is a standard convolutional structure comprising three convolution kernels. Before extracting image features, a 1 &#xd7; 1 convolution kernel is used to increase the channel dimension, generally by two times. Then, we replace the 3 &#xd7; 3 convolution kernel of the original MobileViT with a 7 &#xd7; 7 convolution kernel, thus making it easier to capture long-distance dependencies between pixels. In addition, depthwise separable convolutions are used to reduce the computational complexity of the model and increase the inference speed. Finally, we use a 1 &#xd7; 1 convolution kernel to restore the channel dimension of the image. <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref> shows the overall structure of the inverted residual block.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Structure of the inverted residual block. C<sub>&#xd7;</sub> represents the feature information obtained by convolving each channel of the feature map using a convolution kernel.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g003.tif"/>
</fig>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Mobile ViT block</title>
<p>As described in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>, the MobileViT block learns global representations of feature maps using 1 &#xd7; 1 and 3 &#xd7; 3 convolutions. Before entering the standard transformer encoder, the same color patch at the same position is taken out and put into the same sequence for self-attention calculation. This measure allows us to learn the global representation information of the image in a more blocky manner and reduce the computational cost of the self-attention mechanism. Through the 1 &#xd7; 1 convolution kernel, the output of the transformer is restored to the original channel dimension, and the channel attention and spatial attention information are learned through the CBAM block. Finally, the obtained feature map is spliced with the original feature map to prevent loss of feature information and is then input to the next stage after a 3 &#xd7; 3 convolution.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Detailed description of the vision transformer block. <bold>(A)</bold> The overall structure of the vision transformer block; <bold>(B)</bold> the structure of the vision transformer block encoder; and <bold>(C)</bold> the architecture of the CBAM block, where &#x2297; represents the multiplication with the original feature map.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g004.tif"/>
</fig>
</sec>
<sec id="s2_2_4">
<label>2.2.4</label>
<title>Vision transformer encoder</title>
<p>As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4B</bold>
</xref>, the encoder used to learn image features consists of standard transformer blocks. First, an image with dimensions [C<italic>,H,W</italic>] is divided into patches of size P, and a linear transformation is applied to each patch for flattening. Positional encoding information is then applied to each patch; through this, each patch then has dimensions of <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Next, we use three learnable parameter matrices to multiply each patch to get queries (<italic>W<sup>Q</sup>
</italic>), keys (<italic>W<sup>K</sup>
</italic>), and values (<italic>W<sup>V</sup>
</italic>). For patch i, we apply the dot product to the query matrix with the key matrix of the remaining patches, and then we divide by the square root of the key dimension, <italic>d<sub>k</sub></italic>. Finally, we apply the softmax function to obtain the attention scores of the remaining patches for patch i. These attention scores are multiplied by the value matrix of patch i to obtain the feature information. Equation 1 illustrates the process of the entire attention mechanism. MLP comprises two fully connected layers and employs an incentive compression mechanism to learn interaction information between different dimensions.</p>
<disp-formula>
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>max</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_2_5">
<label>2.2.5</label>
<title>CBAM block</title>
<p>The CBAM block is composed of a channel attention module and a spatial attention module, and it uses a 3 &#xd7; 3 convolution kernel to preprocess the feature map before insertion. We pass the input feature map through a parallel average pooling layer and max pooling layer, and then we change the feature map from [C<italic>,H,W</italic>] to [C,1,1] dimensions. The shared MLP module comprises two 1 &#xd7; 1 convolution kernels, which compress the number of channels to 1/<italic>r</italic> of the original number and then expand it back to the original number of channels. The feature maps obtained by the average pooling layer and the max pooling layer are spliced to obtain the weights of each channel, which are finally multiplied by the original feature map. Equation 2 describes the weight assignment process of the channel attention module. <italic>&#x3c3;</italic> stands for using Sigmoid as the activation function, W<sub>0</sub> &#x2208; &#x211d;<italic>
<sup>C/r</sup>
</italic>
<sup>&#xd7;</sup>
<italic>
<sup>C</sup>
</italic>, and W<sub>1</sub> &#x2208; &#x211d;<italic>
<sup>C</sup>
</italic>
<sup>&#xd7;</sup>
<italic>
<sup>C/r</sup>
</italic>. W<sub>1</sub> and W<sub>0</sub> are shared weights for the two inputs of the max pooling layer and the average pooling layer.</p>
<disp-formula>
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mi>c</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
<mml:mi>c</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;</mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The output of the channel attention module is passed through a max pooling layer and an average pooling layer, giving two feature maps with dimensions of [1<italic>,H,W</italic>], which we then splice together. Through a 7 &#xd7; 7 convolution, we obtain a feature map of one channel and multiply it by the original feature map. Equation 3 shows the forward process of the spatial attention module, while <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4C</bold>
</xref> shows the forward process of the entire CBAM block.</p>
<disp-formula>
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>7</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>7</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mi>s</mml:mi>
</mml:msubsup>
<mml:mo>;</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
<mml:mi>s</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;</mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>App for plant disease identification</title>
<p>We export the trained model to the Open Neural Network Exchange (ONNX) file format to preserve crucial details such as structure and weights. The model is then converted into the NCNN file format for storage to facilitate deployment on a mobile terminal for inference, because NCNN is a high-performance neural network inference framework optimized for mobile platforms. Subsequently, the structure and weight information of the model are extracted for plant disease identification using the C++ language. The XML language is used to define the layout and appearance of the application front-end interface. Lastly, the back-end interaction of the application is developed using the Java language, while the MySQL database is used for storing plant diseases and related information. As shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, the app possesses the capability to perform photo identification using the camera of the device (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5B</bold>
</xref>). Alternatively, it allows users to select pictures from their album for identification (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5C</bold>
</xref>). Furthermore, users have the option to search for plant diseases based on specific conditions or criteria (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5D</bold>
</xref>). The application then presents the relevant categories of plant diseases based on the selected pictures or conditions. <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5E</bold>
</xref> displays the final identification results of plant diseases and the corresponding control methods.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Introduction of plant disease identification app. <bold>(A)</bold> the main page of the app; <bold>(B)</bold> the page for camera recognition; <bold>(C)</bold> the page to select local albums for recognition; <bold>(D)</bold> the page for disease search; and <bold>(E)</bold> the page displaying disease identification results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g005.tif"/>
</fig>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Experimental details</title>
<p>Data augmentation has been shown to improve model robustness and generalization. Before training the network, all images are uniformly resized to 224 &#xd7; 224. The samples in the training, validation, and test sets are randomly rotated and cropped along the center. Finally, we normalize all images using the mean and standard deviation. <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref> describes our hyperparameter settings for model training.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Hyperparameter settings for training.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Name</th>
<th valign="top" align="left">Value</th>
<th valign="top" align="left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Epochs</td>
<td valign="top" align="left">100</td>
<td valign="top" align="left">Number of times the model was trained</td>
</tr>
<tr>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="left">32</td>
<td valign="top" align="left">Number of samples selected for one training</td>
</tr>
<tr>
<td valign="top" align="left">Optimizer</td>
<td valign="top" align="left">AdamW</td>
<td valign="top" align="left">Tool used to bootstrap network update parameters</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate</td>
<td valign="top" align="left">0.0001</td>
<td valign="top" align="left">Tunes parameters in optimization algorithms</td>
</tr>
<tr>
<td valign="top" align="left">Loss function</td>
<td valign="top" align="left">Cross Entropy</td>
<td valign="top" align="left">Evaluates the gap between the predicted value and the true value</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Model evaluation</title>
<p>In this study, we use top-1 accuracy (Equation 4) to determine the highest accuracy that the model can achieve. We also use precision (Equation 5) and recall (Equation 6) to evaluate the performance of the model. Parameters, floating point operations (FLOPs), and frames per second (FPS; the number of images the model processes per second) are used to express the complexity and inference speed of the model. True positive (TP) means that the predicted positive sample is actually a positive sample; false positive (FP) indicates that the predicted positive sample is actually a negative sample; true negative (TN) means that the predicted negative sample is actually a negative sample; and false negative (FN) means that the predicted negative sample is actually a positive sample.</p>
<disp-formula>
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">u</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mtext>Recall</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>Experimental setup</title>
<p>All experiments were run on a deep learning&#x2013;based cloud platform. The hardware configuration is a 14-core Intel(R) Xeon(R) Gold 6330 CPU @ 2.00 GHz, with 45 GB of RAM and an NVIDIA GeForce RTX 3090 GPU. The operating system is Ubuntu 18.04, and PyTorch 1.9.0 and Python 3.8 are used as software support.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and conclusions</title>
<sec id="s3_1" sec-type="results">
<label>3.1</label>
<title>Results</title>
<p>We selected several typically used CNN-based and ViT-based networks for comparison with our model. These include lightweight networks such as SqueezeNet (<xref ref-type="bibr" rid="B11">Iandola et&#xa0;al., 2016</xref>), ShuffleNetV2 (<xref ref-type="bibr" rid="B18">Ma et&#xa0;al., 2018</xref>), MobileNetV3 (<xref ref-type="bibr" rid="B9">Howard et&#xa0;al., 2019</xref>), MobileFormer (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2022</xref>), EfficientNet (<xref ref-type="bibr" rid="B24">Tan and Le, 2019</xref>), and Deit (<xref ref-type="bibr" rid="B26">Touvron et&#xa0;al., 2021</xref>) models. We also chose many heavyweight networks such as PoolFormer (<xref ref-type="bibr" rid="B32">Yu et&#xa0;al., 2022b</xref>), CVT (<xref ref-type="bibr" rid="B29">Wu et&#xa0;al., 2021</xref>), TNT (<xref ref-type="bibr" rid="B6">Han et&#xa0;al., 2021</xref>), and ResNet (<xref ref-type="bibr" rid="B8">He et&#xa0;al., 2016</xref>) for comparison. Additionally, we chose a wheat dataset with multiple components (such as roots, stems, and leaves) to evaluate model performance on images depicting diverse conditions. The coffee dataset was employed to assess the performance of our method when confronted with complex backgrounds. Moreover, the rice dataset was used to investigate the classical binary classification problem.</p>
<p>We chose the wheat dataset to verify the generalizability of PMVT under a real crop growth cycle. We can see from <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref> that our proposed network achieved the best top-1 accuracy when compared with networks with similar parameters. Among the lightweight networks, MobileNetV3-Small achieved an accuracy rate of 92.0%, whereas EfficientNet-B0 achieved a higher accuracy rate of 94.1%. Our PMVT-XXS and PMVT-XS models reached state-of-the-art accuracy with rates of 93.6% and 94.7%, respectively. In comparing heavyweight networks, the PMVT-S model achieved an accuracy rate of 94.9% using only 5.06 M parameters, outperforming ResNet-101, which achieved an accuracy of 94.1% but used 42.5 M parameters. This proves that the proposed model is effective compared to the original MobileViT. <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref> presents the confusion matrix of our proposed model. <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref> depicts the precision of the PMVT model, while <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref> illustrates its recall.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of the PMVT model with other backbone models on three datasets (the FPS indicator is calculated on the desktop computer, and bold text highlights the best-performing network).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" rowspan="2" align="left">Methods</th>
<th valign="top" colspan="3" align="center">Top-1 Accuracy(%)</th>
<th valign="top" rowspan="2" align="center">Parameters (M)</th>
<th valign="top" rowspan="2" align="center">FLOPs (G)</th>
<th valign="top" rowspan="2" align="center">FPS (img/s)</th>
</tr>
<tr>
<th valign="top" align="center">Wheat</th>
<th valign="top" align="center">Coffee</th>
<th valign="top" align="center">Rice</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">SqueezeNet-1.0</td>
<td valign="top" align="center">70.0</td>
<td valign="top" align="center">79.7</td>
<td valign="top" align="center">86.2</td>
<td valign="top" align="center">0.74</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">293.0</td>
</tr>
<tr>
<td valign="top" align="left">SqueezeNet-1.1</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">83.1</td>
<td valign="top" align="center">85.1</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">
<bold>311.5</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">ShuffleNetV2-1.0</td>
<td valign="top" align="center">89.6</td>
<td valign="top" align="center">68.5</td>
<td valign="top" align="center">82.7</td>
<td valign="top" align="center">1.27</td>
<td valign="top" align="center">0.15</td>
<td valign="top" align="center">151.9</td>
</tr>
<tr>
<td valign="top" align="left">MobileNetV3-Small</td>
<td valign="top" align="center">92.0</td>
<td valign="top" align="center">66.3</td>
<td valign="top" align="center">89.7</td>
<td valign="top" align="center">1.54</td>
<td valign="top" align="center">
<bold>0.06</bold>
</td>
<td valign="top" align="center">170.2</td>
</tr>
<tr>
<td valign="top" align="left">PMVT-XXS (ours)</td>
<td valign="top" align="center">
<bold>93.6</bold>
</td>
<td valign="top" align="center">
<bold>85.4</bold>
</td>
<td valign="top" align="center">
<bold>93.1</bold>
</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.31</td>
<td valign="top" align="center">88.5</td>
</tr>
<tr>
<td valign="top" align="left">ShuffleNetV2-1.5</td>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">73.0</td>
<td valign="top" align="center">86.2</td>
<td valign="top" align="center">2.50</td>
<td valign="top" align="center">0.31</td>
<td valign="top" align="center">148.4</td>
</tr>
<tr>
<td valign="top" align="left">MobileFormer-26M</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">77.5</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">2.22</td>
<td valign="top" align="center">
<bold>0.03</bold>
</td>
<td valign="top" align="center">53.1</td>
</tr>
<tr>
<td valign="top" align="left">MobileFormer-52M</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">79.2</td>
<td valign="top" align="center">83.9</td>
<td valign="top" align="center">2.46</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">60.7</td>
</tr>
<tr>
<td valign="top" align="left">MobileFormer-96M</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">84.2</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">3.33</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">58.8</td>
</tr>
<tr>
<td valign="top" align="left">MobileNetV3-Large</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">72.0</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">4.22</td>
<td valign="top" align="center">0.23</td>
<td valign="top" align="center">
<bold>141.0</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">EfficientNet-B0</td>
<td valign="top" align="center">94.1</td>
<td valign="top" align="center">84.2</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">4.03</td>
<td valign="top" align="center">0.41</td>
<td valign="top" align="center">109.9</td>
</tr>
<tr>
<td valign="top" align="left">PMVT-XS (ours)</td>
<td valign="top" align="center">
<bold>94.7</bold>
</td>
<td valign="top" align="center">
<bold>86.5</bold>
</td>
<td valign="top" align="center">
<bold>97.7</bold>
</td>
<td valign="top" align="center">2.01</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">85.3</td>
</tr>
<tr>
<td valign="top" align="left">ShuffleNetV2-2.0</td>
<td valign="top" align="center">93.6</td>
<td valign="top" align="center">70.0</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">5.38</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">146.2</td>
</tr>
<tr>
<td valign="top" align="left">MobileFormer-151M</td>
<td valign="top" align="center">94.4</td>
<td valign="top" align="center">75.3</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">6.34</td>
<td valign="top" align="center">
<bold>0.10</bold>
</td>
<td valign="top" align="center">42.3</td>
</tr>
<tr>
<td valign="top" align="left">EfficientNet-B1</td>
<td valign="top" align="center">94.4</td>
<td valign="top" align="center">79.8</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">6.53</td>
<td valign="top" align="center">0.61</td>
<td valign="top" align="center">75.3</td>
</tr>
<tr>
<td valign="top" align="left">EfficientNet-B2</td>
<td valign="top" align="center">93.3</td>
<td valign="top" align="center">83.1</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">7.72</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">76.6</td>
</tr>
<tr>
<td valign="top" align="left">Deit-Tiny</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">78.7</td>
<td valign="top" align="center">84.0</td>
<td valign="top" align="center">5.49</td>
<td valign="top" align="center">1.08</td>
<td valign="top" align="center">161.7</td>
</tr>
<tr>
<td valign="top" align="left">PoolFormer-S12</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">85.1</td>
<td valign="top" align="center">11.39</td>
<td valign="top" align="center">1.81</td>
<td valign="top" align="center">
<bold>178.3</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">CVT-Tiny</td>
<td valign="top" align="center">93.6</td>
<td valign="top" align="center">82.0</td>
<td valign="top" align="center">86.2</td>
<td valign="top" align="center">19.63</td>
<td valign="top" align="center">4.08</td>
<td valign="top" align="center">62.2</td>
</tr>
<tr>
<td valign="top" align="left">TNT-Small</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">80.9</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">23.40</td>
<td valign="top" align="center">4.85</td>
<td valign="top" align="center">67.3</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">70.8</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">23.53</td>
<td valign="top" align="center">4.13</td>
<td valign="top" align="center">125.1</td>
</tr>
<tr>
<td valign="top" align="left">ResNet101</td>
<td valign="top" align="center">94.1</td>
<td valign="top" align="center">63.0</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">42.50</td>
<td valign="top" align="center">7.86</td>
<td valign="top" align="center">66.3</td>
</tr>
<tr>
<td valign="top" align="left">PMVT-S (ours)</td>
<td valign="top" align="center">
<bold>94.9</bold>
</td>
<td valign="top" align="center">
<bold>87.6</bold>
</td>
<td valign="top" align="center">
<bold>92.0</bold>
</td>
<td valign="top" align="center">
<bold>5.06</bold>
</td>
<td valign="top" align="center">1.59</td>
<td valign="top" align="center">81.3</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Confusion matrix of the PMVT model on the wheat dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g006.tif"/>
</fig>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Precision of the PMVT model on the wheat dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g007.tif"/>
</fig>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Recall of the PMVT model on the wheat dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g008.tif"/>
</fig>
<p>The coffee dataset was used to compare the performance of the PMVT models in the field environment. As can be seen from <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, the traditional lightweight networks did not achieve acceptable accuracy rates. The XXS version of the PMVT model achieved a top-1 accuracy rate of 85.4%, which was 2.3% higher than that of the SqueezeNet-1.1 model. Compared with the MobileFormer-96M model, the XS version of the PMVT model improved accuracy by 2.3% to reach 86.5%. Finally, the S version of the PMVT model achieved an accuracy rate of 87.6% on this dataset; this was an improvement of 2.2% over that obtained by the PoolFormer-S12 model. <xref ref-type="fig" rid="f9">
<bold>Figures&#xa0;9</bold>
</xref> and <xref ref-type="fig" rid="f10">
<bold>10</bold>
</xref> present the confusion matrix, precision, and recall of the PMVT model. It can be seen from the figures that our model does not achieve satisfactory results in identifying red spider mite diseases.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Confusion matrix of the PMVT model on the coffee dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g009.tif"/>
</fig>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Precision and recall of the PMVT model on the coffee dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g010.tif"/>
</fig>
<p>We used the rice dataset to simultaneously test the fitting ability of the PMVT model in a controlled laboratory environment and under real natural conditions. Surprisingly, the XS version of PMVT achieved 97.7% accuracy on this dataset, which was 5.8% higher than the second-highest accuracy (obtained by the MobileNetV3-Large model). In addition, the XXS version attained an accuracy of 93.1%, which was 3.4% higher than the baseline of the MobileNetV3-Small model. The S version of the PMVT model performed the worst, with an accuracy of 92%; however, it still outperformed the ShuffleNetV2-2.0 model with similar parameters by 0.6%. Upon comparing models with similar sizes, we found that the PMVT model achieved the best accuracy rate. This proved that our model is very competitive on the classic binary classification problem. <xref ref-type="fig" rid="f11">
<bold>Figures&#xa0;11</bold>
</xref> and <xref ref-type="fig" rid="f12">
<bold>12</bold>
</xref> depict the confusion matrix, precision, and recall of the PMVT model on the rice dataset.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Confusion matrix of the PMVT model on the rice dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g011.tif"/>
</fig>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Precision and recall of the PMVT model on the rice dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1256773-g012.tif"/>
</fig>
<p>As seen in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, our method does not excel in terms of FPS and FLOPs metrics. This is because the self-attention mechanism computes the weights between image patches, resulting in numerous matrix calculations and multiplication operations during inference. Consequently, this increases the computational time. Additionally, because of the current immaturity of deep learning framework technology, numerous attention-weight matrices must be stored and processed, thereby occupying a significant amount of memory. Nevertheless, PMVT achieves the best accuracy with only 0.98M parameters. This makes it a low-cost, high-accuracy option for plant disease identification. As artificial intelligence technology advances, ViT can be better applied to the visual task of plant disease identification.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Ablation studies</title>
<p>The data given in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref> demonstrate the effectiveness of each module in our models. +Conv7 &#xd7; 7 represents using a convolution kernel of size 7 instead of the 3 &#xd7; 3 convolution in the CNN block based on the MobileViT model. +CBAM uses channel attention and spatial attention integrated in the ViT block based on the MobileViT model. PMVT represents a new backbone network built on the basis of MobileViT using both 7 &#xd7; 7 convolution kernels and CBAM modules. It can be seen that each component can improve the accuracy of the model to varying degrees.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Ablation experiments investigating each component in the PMVT model (bold text highlights the best-performing network).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Methods</th>
<th valign="top" align="left">Wheat(%)</th>
<th valign="top" align="left">Coffee(%)</th>
<th valign="top" align="left">Rice(%)</th>
<th valign="top" align="left">Params(M)</th>
<th valign="top" align="left">FLOPs(G)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">MobileViT-XXS</td>
<td valign="top" align="left">91.4</td>
<td valign="top" align="left">83.1</td>
<td valign="top" align="left">92.0</td>
<td valign="top" align="left">0.96</td>
<td valign="top" align="left">0.27</td>
</tr>
<tr>
<td valign="top" align="left">+Conv7x7</td>
<td valign="top" align="left">92.2(+0.8)</td>
<td valign="top" align="left">84.0(+1.1)</td>
<td valign="top" align="left">92.8(+0.8)</td>
<td valign="top" align="left">0.97(+0.01)</td>
<td valign="top" align="left">0.30(+0.03)</td>
</tr>
<tr>
<td valign="top" align="left">+CBAM</td>
<td valign="top" align="left">92.5(+1.1)</td>
<td valign="top" align="left">84.1(+1.0)</td>
<td valign="top" align="left">92.6(+0.6)</td>
<td valign="top" align="left">0.97(+0.01)</td>
<td valign="top" align="left">0.27</td>
</tr>
<tr>
<td valign="top" align="left">PMVT-XXS</td>
<td valign="top" align="left">
<bold>93.6(+2.2)</bold>
</td>
<td valign="top" align="left">
<bold>85.3(+2.1)</bold>
</td>
<td valign="top" align="left">
<bold>93.1(+1.1)</bold>
</td>
<td valign="top" align="left">
<bold>0.98(+0.02)</bold>
</td>
<td valign="top" align="left">
<bold>0.31(+0.04)</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">MobileViT-XS</td>
<td valign="top" align="left">93.3</td>
<td valign="top" align="left">84.2</td>
<td valign="top" align="left">94.2</td>
<td valign="top" align="left">1.94</td>
<td valign="top" align="left">0.74</td>
</tr>
<tr>
<td valign="top" align="left">+Conv7x7</td>
<td valign="top" align="left">93.9(+0.6)</td>
<td valign="top" align="left">85.3(+1.1)</td>
<td valign="top" align="left">95.8(+1.6)</td>
<td valign="top" align="left">1.99(+0.05)</td>
<td valign="top" align="left">0.84(+0.10)</td>
</tr>
<tr>
<td valign="top" align="left">+CBAM</td>
<td valign="top" align="left">93.6(+0.3)</td>
<td valign="top" align="left">85.6(+1.4)</td>
<td valign="top" align="left">96.5(+2.3)</td>
<td valign="top" align="left">1.95(+0.01)</td>
<td valign="top" align="left">0.76(+0.02)</td>
</tr>
<tr>
<td valign="top" align="left">PMVT-XS</td>
<td valign="top" align="left">
<bold>94.7(+1.4)</bold>
</td>
<td valign="top" align="left">
<bold>86.5(+2.3)</bold>
</td>
<td valign="top" align="left">
<bold>97.7(+3.5)</bold>
</td>
<td valign="top" align="left">
<bold>2.01(+0.07)</bold>
</td>
<td valign="top" align="left">
<bold>0.85(+0.11)</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">MobileViT-S</td>
<td valign="top" align="left">93.9</td>
<td valign="top" align="left">84.3</td>
<td valign="top" align="left">89.7</td>
<td valign="top" align="left">4.95</td>
<td valign="top" align="left">1.46</td>
</tr>
<tr>
<td valign="top" align="left">+Conv7x7</td>
<td valign="top" align="left">94.4(+0.5)</td>
<td valign="top" align="left">85.4(+1.1)</td>
<td valign="top" align="left">90.9(+1.2)</td>
<td valign="top" align="left">5.02(+0.07)</td>
<td valign="top" align="left">1.59(+0.13)</td>
</tr>
<tr>
<td valign="top" align="left">+CBAM</td>
<td valign="top" align="left">94.4(+0.5)</td>
<td valign="top" align="left">84.7(+0.4)</td>
<td valign="top" align="left">91.1(+1.4)</td>
<td valign="top" align="left">4.98(+0.03)</td>
<td valign="top" align="left">1.47(+0.01)</td>
</tr>
<tr>
<td valign="top" align="left">PMVT-S</td>
<td valign="top" align="left">
<bold>94.9(+1.0)</bold>
</td>
<td valign="top" align="left">
<bold>87.6(+3.3)</bold>
</td>
<td valign="top" align="left">
<bold>92.0(+2.3)</bold>
</td>
<td valign="top" align="left">
<bold>5.06(+0.11)</bold>
</td>
<td valign="top" align="left">
<bold>1.59(+0.13)</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_3" sec-type="conclusions">
<label>3.3</label>
<title>Conclusion</title>
<p>In this paper, we constructed a computationally efficient vision transformer (ViT) model, referred to as PMVT, for the identification of plant diseases. In PMVT, larger convolution kernels and CBAM modules enhance the model&#x2019;s feature extraction capability. Comparative experiments conducted on multiple datasets containing images of plant diseases demonstrated that PMVT outperforms both lightweight and heavyweight networks. Additionally, PMVT has more powerful generalization capabilities and can be deployed on mobile devices for diagnosing plant diseases in field environments. However, because ViT has had a shorter development history, lightweight ViT models are still comparatively slower than traditional lightweight CNNs when processing images. We expect that further advancement of deep learning framework technology will enable ViT to perform computer vision tasks more effectively.</p>
</sec>
</sec>
<sec id="s4" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s5" sec-type="author-contributions">
<title>Author contributions</title>
<p>GL: Conceptualization, Methodology, Writing &#x2013; review &amp; editing. YW: Software, Visualization, Writing &#x2013; original draft. QZ: Data curation, Validation, Writing &#x2013; review &amp; editing. PY: Resources, Writing &#x2013; review &amp; editing. BC: Funding acquisition, Supervision, Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="funding-information">
<title>Funding</title>
<p>The authors declare that financial support was received for the research, authorship, and/or publication of this article. This research was funded by the major project of science and technology of Henan Province (Grant No. 221100110800), the independent innovation project of Henan Academy of Agricultural Sciences (Grant No. 2023ZC067), and the innovation team of agricultural information technology (Grant No. 2023TD10).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>We thank LetPub (<ext-link ext-link-type="uri" xlink:href="http://www.letpub.com">www.letpub.com</ext-link>) for its linguistic assistance during the preparation of this manuscript.</p>
</ack>
<sec id="s7" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s8" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmad</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Saraswat</surname> <given-names>D.</given-names>
</name>
<name>
<surname>El Gamal</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A survey on using deep learning techniques for plant disease diagnosis and recommendations for development of appropriate tools</article-title>. <source>Smart Agric. Technol.</source> <volume>3</volume>, <elocation-id>100083</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.atech.2022.100083</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Akshai</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Anitha</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Plant disease classification using deep learning</article-title>,&#x201d; in <conf-name>2021 3rd International Conference on Signal Processing and Communication (ICPSC) (IEEE)</conf-name>. <fpage>407</fpage>&#x2013;<lpage>411</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Lightweight convolutional neural network model for field wheat ear disease identification</article-title>. <source>Comput. Electron. Agric.</source> <volume>189</volume>, <fpage>106367</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compag.2021.106367</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). &#x201c;<article-title>Mobile-former: Bridging mobilenet and transformer</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>5270</fpage>&#x2013;<lpage>5279</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Transformer in transformer</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>34</volume>, <fpage>15908</fpage>&#x2013;<lpage>15919</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hasan</surname> <given-names>R. I.</given-names>
</name>
<name>
<surname>Yusuf</surname> <given-names>S. M.</given-names>
</name>
<name>
<surname>Alzubaidi</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Review of the state of the art of deep learning for plant diseases: A broad analysis and discussion</article-title>. <source>Plants</source> <volume>9</volume>, <elocation-id>1302</elocation-id>. doi: <pub-id pub-id-type="doi">10.3390/plants9101302</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Howard</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;<article-title>Searching for mobilenetv3</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Salath&#xe9;</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>An open access repository of images on plant health to enable the development of mobile disease diagnostics</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1511.08060</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iandola</surname> <given-names>F. N.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Moskewicz</surname> <given-names>M. W.</given-names>
</name>
<name>
<surname>Ashraf</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Dally</surname> <given-names>W. J.</given-names>
</name>
<name>
<surname>Keutzer</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Squeezenet: Alexnet-level accuracy with 50x fewer parameters and &lt;0.5 MB model size</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1602.07360</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Naseer</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Hayat</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zamir</surname> <given-names>S. W.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>F. S.</given-names>
</name>
<name>
<surname>Shah</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Transformers in vision: A survey</article-title>. <source>ACM Computing Surveys (CSUR)</source> <volume>54</volume>, <fpage>1</fpage>&#x2013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3505244</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lian</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Wheat disease classification</article-title>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A survey of transformers</article-title>. <source>AI Open</source> <volume>3</volume>, <fpage>111</fpage>&#x2013;<lpage>132</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.aiopen.2022.10.001</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.</citation>
</ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Feichtenhofer</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Darrell</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>A convnet for the 2020s</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>11976</fpage>&#x2013;<lpage>11986</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>A hybrid model of ghost-convolution enlightened transformer for effective diagnosis of grape leaf disease and pest</article-title>. <source>J. King Saud University-Computer Inf. Sci.</source> <volume>34</volume>, <fpage>1755</fpage>&#x2013;<lpage>1767</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jksuci.2022.03.006</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>H.-T.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Shufflenet v2: Practical guidelines for efficient cnn architecture design</article-title>,&#x201d; in <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>. <fpage>116</fpage>&#x2013;<lpage>131</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mehta</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Rastegari</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2110.02178</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Parraga-Alava</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cusme</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Loor</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Santander</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Rocole: A robusta coffee leaf images dataset for evaluation of machine learning based methods in plant diseases recognition</article-title>. <source>Data Brief</source> <volume>25</volume>, <fpage>104414</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.dib.2019.104414</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Savary</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Willocquet</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Pethybridge</surname> <given-names>S. J.</given-names>
</name>
<name>
<surname>Esker</surname> <given-names>P.</given-names>
</name>
<name>
<surname>McRoberts</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Nelson</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>The global burden of pathogens and pests on major food crops</article-title>. <source>Nat. Ecol. Evol.</source> <volume>3</volume>, <fpage>430</fpage>&#x2013;<lpage>439</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41559-018-0793-y</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sethy</surname> <given-names>P. K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Rice leaf disease image samples</article-title>. <source>Mendeley Data</source> <volume>1</volume>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sheng</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Ruan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Disease diagnostic method based on cascade backbone network for apple leaf disease classification</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>994227</elocation-id>. doi: <pub-id pub-id-type="doi">10.3389/fpls.2022.994227</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Le</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Efficientnet: Rethinking model scaling for convolutional neural networks</article-title>,&#x201d; in <conf-name>International conference on machine learning (PMLR)</conf-name>. <fpage>6105</fpage>&#x2013;<lpage>6114</lpage>.</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thakur</surname> <given-names>P. S.</given-names>
</name>
<name>
<surname>Sheorey</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Ojha</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Vgg-icnn: A lightweight cnn model for crop disease identification</article-title>. <source>Multimedia Tools Appl.</source> <volume>82</volume>, <fpage>497</fpage>&#x2013;<lpage>520</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11042-022-13144-z</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Touvron</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Cord</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Douze</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Massa</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Sablayrolles</surname> <given-names>A.</given-names>
</name>
<name>
<surname>J&#xe9;gou</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Training data-efficient image transformers &amp; distillation through attention</article-title>,&#x201d; in <conf-name>International conference on machine learning (PMLR)</conf-name>. <fpage>10347</fpage>&#x2013;<lpage>10357</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Shazeer</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Parmar</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Uszkoreit</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jones</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Gomez</surname> <given-names>A. N.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>30</volume>.</citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Woo</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Park</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J.-Y.</given-names>
</name>
<name>
<surname>Kweon</surname> <given-names>I. S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Cbam: Convolutional block attention module</article-title>,&#x201d; in <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>. <fpage>3</fpage>&#x2013;<lpage>19</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Codella</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Cvt: Introducing convolutions to vision transformers</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. <fpage>22</fpage>&#x2013;<lpage>31</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Shu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A review of plant phenotypic image recognition technology based on deep learning</article-title>. <source>Electronics</source> <volume>10</volume>, <fpage>81</fpage>. doi: <pub-id pub-id-type="doi">10.3390/electronics10010081</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Heidari</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>a). <article-title>Apple leaf disease recognition method with improved residual network</article-title>. <source>Multimedia Tools Appl.</source> <volume>81</volume>, <fpage>7759</fpage>&#x2013;<lpage>7782</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11042-022-11915-2</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Si</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>b). &#x201c;<article-title>Metaformer is actually what you need for vision</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <fpage>10819</fpage>&#x2013;<lpage>10829</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Inception convolutional vision transformers for plant disease identification</article-title>. <source>Internet Things</source> <volume>21</volume>, <fpage>100650</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.iot.2022.100650</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>