<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1352935</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Semantic segmentation of microbial alterations based on SegFormer</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Elmessery</surname>
<given-names>Wael M.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2618887"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Maklakov</surname>
<given-names>Danil V.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2633011"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>El-Messery</surname>
<given-names>Tamer M.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1789201"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Baranenko</surname>
<given-names>Denis A.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1109164"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Guti&#xe9;rrez</surname>
<given-names>Joaqu&#xed;n</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shams</surname>
<given-names>Mahmoud Y.</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1793492"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>El-Hafeez</surname>
<given-names>Tarek Abd</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Elsayed</surname>
<given-names>Salah</given-names>
</name>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/533880"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Alhag</surname>
<given-names>Sadeq K.</given-names>
</name>
<xref ref-type="aff" rid="aff8">
<sup>8</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1883016"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Moghanm</surname>
<given-names>Farahat S.</given-names>
</name>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2600206"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mulyukin</surname>
<given-names>Maksim A.</given-names>
</name>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Petrova</surname>
<given-names>Yuliya Yu.</given-names>
</name>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Elwakeel</surname>
<given-names>Abdallah E.</given-names>
</name>
<xref ref-type="aff" rid="aff11">
<sup>11</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2600225"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Agricultural Engineering Department, Faculty of Agriculture, Kafrelsheikh University</institution>, <addr-line>Kafr El-Sheikh</addr-line>, <country>Egypt</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Engineering Group, Centro de Investigaciones Biol&#xf3;gicas del Noroeste</institution>, <addr-line>La Paz, Baja California Sur</addr-line>, <country>Mexico</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>International Research Centre &#x201c;Biotechnologies of the Third Millennium&#x201d;, Faculty of Biotechnologies (BioTech), ITMO University</institution>, <addr-line>St. Petersburg</addr-line>, <country>Russia</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Machine Learning and Information Retrieval, Faculty of Artificial Intelligence, Kafrelsheikh University</institution>, <addr-line>Kafr El-Sheikh</addr-line>, <country>Egypt</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Department of Computer Science, Faculty of Science, Minia University</institution>, <addr-line>Minia</addr-line>, <country>Egypt</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Computer Science Unit, Deraya University, Minia University</institution>, <addr-line>Minia</addr-line>, <country>Egypt</country>
</aff>
<aff id="aff7">
<sup>7</sup>
<institution>Agricultural Engineering, Evaluation of Natural Resources Department, Environmental Studies and Research Institute, University of Sadat City</institution>, <addr-line>Sadat City</addr-line>, <country>Egypt</country>
</aff>
<aff id="aff8">
<sup>8</sup>
<institution>Biology Department, College of Science and Arts, King Khalid University</institution>, <addr-line>Abha</addr-line>, <country>Saudi Arabia</country>
</aff>
<aff id="aff9">
<sup>9</sup>
<institution>Soil and Water Department, Faculty of Agriculture, Kafrelsheikh University</institution>, <addr-line>Kafr El-Sheikh</addr-line>, <country>Egypt</country>
</aff>
<aff id="aff10">
<sup>10</sup>
<institution>Institute of Natural and Technical Sciences, Surgut State University</institution>, <addr-line>Surgut</addr-line>, <country>Russia</country>
</aff>
<aff id="aff11">
<sup>11</sup>
<institution>Agricultural Engineering Department, Faculty of Agriculture and Natural Resources, Aswan University</institution>, <addr-line>Aswan</addr-line>, <country>Egypt</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Ning Yang, Jiangsu University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Parvathaneni Naga Srinivasu, Prasad V. Potluri Siddhartha Institute of Technology, India</p>
<p>Dongmei Chen, Hangzhou Dianzi University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Wael M. Elmessery, <email xlink:href="mailto:wael.elmessery@gmail.com">wael.elmessery@gmail.com</email>; Tamer M. El-Messery, <email xlink:href="mailto:tmelmessery@outlook.com">tmelmessery@outlook.com</email>; Denis A. Baranenko, <email xlink:href="mailto:denis.baranenko@itmo.ru">denis.baranenko@itmo.ru</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>06</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1352935</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>12</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>05</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Elmessery, Maklakov, El-Messery, Baranenko, Guti&#xe9;rrez, Shams, El-Hafeez, Elsayed, Alhag, Moghanm, Mulyukin, Petrova and Elwakeel</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Elmessery, Maklakov, El-Messery, Baranenko, Guti&#xe9;rrez, Shams, El-Hafeez, Elsayed, Alhag, Moghanm, Mulyukin, Petrova and Elwakeel</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Precise semantic segmentation of microbial alterations is paramount for their evaluation and treatment. This study focuses on harnessing the SegFormer segmentation model for precise semantic segmentation of strawberry diseases, aiming to improve disease detection accuracy under natural acquisition conditions.</p>
</sec>
<sec>
<title>Methods</title>
<p>Three distinct Mix Transformer encoders - MiT-B0, MiT-B3, and MiT-B5 - were thoroughly analyzed to enhance disease detection, targeting diseases such as Angular leaf spot, Anthracnose rot, Blossom blight, Gray mold, Leaf spot, Powdery mildew on fruit, and Powdery mildew on leaves. The dataset consisted of 2,450 raw images, expanded to 4,574 augmented images. The Segment Anything Model integrated into the Roboflow annotation tool facilitated efficient annotation and dataset preparation.</p>
</sec>
<sec>
<title>Results</title>
<p>The results reveal that MiT-B0 demonstrates balanced but slightly overfitting behavior, MiT-B3 adapts rapidly with consistent training and validation performance, and MiT-B5 offers efficient learning with occasional fluctuations, providing robust performance. MiT-B3 and MiT-B5 consistently outperformed MiT-B0 across disease types, with MiT-B5 achieving the most precise segmentation in general.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The findings provide key insights for researchers to select the most suitable encoder for disease detection applications, propelling the field forward for further investigation. The success in strawberry disease analysis suggests potential for extending this approach to other crops and diseases, paving the way for future research and interdisciplinary collaboration.</p>
</sec>
</abstract>
<kwd-group>
<kwd>computer vision</kwd>
<kwd>mix transformer encoders</kwd>
<kwd>disease detection</kwd>
<kwd>smart agriculture</kwd>
<kwd>food safety</kwd>
</kwd-group>
<contract-sponsor id="cn001">Russian Science Foundation<named-content content-type="fundref-id">10.13039/501100006769</named-content>
</contract-sponsor>
<counts>
<fig-count count="10"/>
<table-count count="6"/>
<equation-count count="7"/>
<ref-count count="44"/>
<page-count count="20"/>
<word-count count="9346"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>As artificial intelligence continues to find applications in diverse domains, the field of agricultural science is no exception. Computer vision methodologies have been introduced to various tasks related to plant image analysis. These tasks encompass plant classification, as demonstrated in the works of <xref ref-type="bibr" rid="B3">Barre et&#xa0;al. (2017)</xref> and <xref ref-type="bibr" rid="B29">W&#xe4;ldchen and M&#xe4;der (2018)</xref>, as well as the detection of plant diseases and pests, as evidenced by <xref ref-type="bibr" rid="B27">Shruthi et&#xa0;al. (2019)</xref> and <xref ref-type="bibr" rid="B7">Chouhan et&#xa0;al. (2020)</xref>.</p>
<p>The detection of plant diseases and pests has garnered substantial interest, mainly centering around deep-learning-driven computer vision techniques. Distinct from traditional computer vision models that rely on human-crafted image features, these modern approaches display enhanced robustness to environmental disparities, attributable to extensive training on expansive datasets. The Egyptian agricultural economy has witnessed a surge in prominence pertaining to strawberry farming, attributed largely to the nation&#x2019;s auspicious climate and fertile lands located in regions like Wadi El Natroun, El Beheira, and Fayoum. Ideal for strawberry cultivation, these territories accommodate bountiful harvests annually from November to April. Spearheading strawberry production in the Middle East and North African region, Egypt recorded a yield of approximately 597.03 thousand tons in 2020. Export trends indicate a steady flow of strawberry shipments, predominantly directed towards European markets, culminating in a figure of 24.72 thousand tons in 2022 (<xref ref-type="bibr" rid="B28">TRIDGE, 2023</xref>).</p>
<p>While the strawberry industry in Egypt has experienced growth, it faces certain challenges, such as the need for improved pest and disease management practices. Detecting plant diseases at their initial stages can significantly reduce the need for potentially harmful chemicals and minimize labor expenses associated with managing afflicted plants. Even experienced farmers can face challenges in identifying diseases in large greenhouse settings before they propagate. Hence, an automated disease detection system will serve as a valuable complement to farmers&#x2019; expertise and effort. Timely detection and accurate identification of pests are crucial not only for preventing crop damage, but also for avoiding the incorrect and excessive application of pesticide sprays (<xref ref-type="bibr" rid="B10">Dong et&#xa0;al., 2021</xref>). From the analysis of various datasets related to strawberry diseases, we have identified the presence of seven distinct diseases: Leaf spot (<italic>Mycosphaerella fragariae</italic>), Angular leaf spot (<italic>Xanthomonas fragariae</italic>), Anthracnose rot (<italic>Colletotrichum acutatum</italic>), Blossom blight (<italic>Monilinia fructicola</italic>), Gray mold (<italic>Botrytis cinerea</italic>), Powdery mildew on fruit (<italic>Podosphaera aphanis</italic>), and Powdery mildew on leaves (<italic>Podosphaera macularis</italic>). Efficient and accurate segmentation of leaf disease represents a significant area of research. To tackle this challenge, a wide range of computer vision segmentation methods have been employed, leveraging image attributes like hue, texture, form, and spatial information (<xref ref-type="bibr" rid="B21">Pugoy and Mariano, 2011</xref>; <xref ref-type="bibr" rid="B23">Revathi and Hemalatha, 2012</xref>; <xref ref-type="bibr" rid="B32">Wang et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B9">Deenan et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B42">Zhao et&#xa0;al., 2020</xref>). 
However, these conventional techniques come with inherent limitations and typically require a significant amount of time. The emergence of deep learning models marks a transformative era for segmenting images. <xref ref-type="bibr" rid="B16">Li et al. (2023)</xref> introduced a network grounded in copy-paste techniques and SegFormer, showcasing its prowess in precise segmentation of disease regions and evaluation of their severity, marked by mean intersection over union of 85.38%. <xref ref-type="bibr" rid="B33">Wu et al. (2023)</xref> further refined the landscape by enhancing DETR, leading to the efficient segmentation of tomato leaf disease spots and achieving an impressive accuracy of 96.40%. <xref ref-type="bibr" rid="B43">Zhao et al. (2022)</xref> proposed a multiple disease detection method for greenhouse-cultivated strawberry based on multiscale feature fusion Faster R-CNN.</p>
<p>In a comprehensive investigation conducted by (<xref ref-type="bibr" rid="B19">Minaee et&#xa0;al., 2020</xref>) an extensive evaluation of segmentation approaches based on deep learning presented in 2019 was carried out. Notably, Convolutional Neural Networks (CNNs) have been extensively utilized in tasks related to the segmentation of agricultural diseases. They have proven instrumental in enhancing the precision of disease spot identification and significantly expanding the range of potential applications (<xref ref-type="bibr" rid="B14">Jiang et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B8">Craze et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B37">Yao et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B39">Yong et&#xa0;al., 2023</xref>).</p>
<p>Transformers have shown superior performance compared to convolutional neural networks, achieving state-of-the-art results with fewer parameters and higher computational efficiency (<xref ref-type="bibr" rid="B12">Fan and Liu, 2023</xref>). Transformers, particularly self-attention modules, provide efficient object detection models and improve detection accuracy in deep foggy conditions (<xref ref-type="bibr" rid="B26">Shit et&#xa0;al., 2023</xref>). They also offer consistent, albeit modest, performance improvements when added to state-of-the-art segmentation models for overhead imagery (<xref ref-type="bibr" rid="B18">Luzi et&#xa0;al., 2023</xref>). However, transformers have some limitations. They are difficult to train and have lower performance on small datasets compared to convolutional neural networks (<xref ref-type="bibr" rid="B5">Chen and Feng, 2023</xref>). Fully transformer-based models may achieve relatively poor performance, while hybrid models that combine convolutional and transformer-based structures show better results.</p>
<p>Transformer-based architectures can be adapted to handle other visual tasks, such as object detection and segmentation, by leveraging their self-attention mechanism and hierarchical feature representation capabilities. These architectures have shown remarkable advancements in visual segmentation tasks, surpassing previous convolutional or recurrent approaches (<xref ref-type="bibr" rid="B13">Gao et&#xa0;al., 2023</xref>).</p>
<p>In the realm of semantic segmentation for agricultural diseases, a series of transformative visual networks based on Transformers has unfolded, showcasing notable advancements. The journey begins with the inception of models like Detection Transformer (DETR) (<xref ref-type="bibr" rid="B4">Carion et&#xa0;al., 2020</xref>), Vision Transformer (ViT) (<xref ref-type="bibr" rid="B11">Dosovitskiy et&#xa0;al., 2020</xref>), Swin Transformer (SwinT) (<xref ref-type="bibr" rid="B17">Liu et&#xa0;al., 2021</xref>), Semantic Transformation model (SETR) (<xref ref-type="bibr" rid="B44">Zheng et&#xa0;al., 2021</xref>), and SegFormer (<xref ref-type="bibr" rid="B35">Xie et&#xa0;al., 2021</xref>). Building on this foundation (<xref ref-type="bibr" rid="B31">Wang et&#xa0;al., 2022</xref>), elevated the SwinT network, employing it for identifying real plant leaf diseases, (<xref ref-type="bibr" rid="B34">Wu et&#xa0;al., 2022</xref>) further refined the landscape by enhancing DETR, leading to the efficient segmentation of tomato leaf disease spots and achieving an impressive accuracy of 96.40%. (<xref ref-type="bibr" rid="B22">Reedha et&#xa0;al., 2022</xref>) took a visionary leap by applying vision transformer (ViT) for categorizing weeds and crop images obtained from agricultural drones, outperforming traditional CNNs with an outstanding F1 score of 99.28%. In a pursuit of lightweight yet effective solutions, (<xref ref-type="bibr" rid="B15">Li et&#xa0;al., 2022</xref>) introduced a network grounded in copy&#x2013;paste techniques and SegFormer, showcasing its prowess in precise segmentation of disease regions and evaluation of their severity, marked by mean intersection over union of 85.38%. The narrative unfolds further with (<xref ref-type="bibr" rid="B40">Zhang et&#xa0;al., 2023</xref>), who suggested a specialized segmentation framework known as the Cross-Resolution Transformer, tailored for identifying the leaf disease of the grape in natural environments. 
Through these transformative steps, SegFormer emerges as a straightforward, effective, and resilient framework for semantic segmentation unifying Transformers with nimble multi-layer perceptron decoders, thereby contributing significantly to the evolving landscape of agricultural disease segmentation.</p>
<sec id="s1_1">
<label>1.1</label>
<title>Problem statement</title>
<p>Precise detection and segmentation of strawberry diseases are crucial for effective management and treatment. Traditional computer vision methods often fall short in accurately identifying diseases, particularly under natural acquisition conditions. Deep learning models, especially transformer-based architectures like SegFormer, offer promising solutions. However, selecting an appropriate mix transformer encoder for optimal performance remains a challenge. Moreover, the existing studies often lack in-depth analysis and comparison of different encoder variants in the context of disease detection accuracy. Therefore, this study aims to address these gaps by evaluating and enhancing the SegFormer segmentation model using three distinct Mix Transformer encoders (MiT-B0, MiT-B3, and MiT-B5) for precise identification and localization of various strawberry diseases.</p>
</sec>
<sec id="s1_2">
<label>1.2</label>
<title>Contributions</title>
<p>This study explores the potential of SegFormer, a powerful segmentation model, for accurately detecting and distinguishing seven strawberry diseases. Three Mix Transformer encoders within SegFormer were investigated and their performance, adaptability, and impact on disease detection were analyzed. The main contributions can be summarized as follows:</p>
<list list-type="bullet">
<list-item>
<p>
<bold>Hybrid model design:</bold> A novel hybrid model leverages the strengths of both Mix Transformer encoders and SegFormer architecture for effective disease segmentation while mitigating overfitting and generalization issues.</p>
</list-item>
<list-item>
<p>
<bold>Extensive dataset:</bold> Experiments are conducted on a diverse dataset of 4,574 augmented images, ensuring balanced class representation and enabling robust performance assessment under various disease scenarios.</p>
</list-item>
<list-item>
<p>
<bold>Quantitative and qualitative results:</bold> Using metrics like mIoU and MPA, superior performance compared to the existing methods is demonstrated. Visual examples further confirm the model&#x2019;s robustness and practical value.</p>
</list-item>
<list-item>
<p>
<bold>State-of-the-art performance:</bold> This approach achieves outstanding accuracy, efficiency, and reduced model complexity compared to the established models, making SegFormer a strong contender for real-world applications in strawberry disease detection.</p>
</list-item>
<list-item>
<p>
<bold>Insights and future directions:</bold> Valuable insights into the relationship between encoders and SegFormer performance are provided, guiding researchers in model fine-tuning and tailored strategies for diverse agricultural challenges.</p>
</list-item>
<list-item>
<p>
<bold>Wider applicability:</bold> The success in strawberry disease analysis suggests potential for extending this approach to other crops and diseases, paving the way for future research and interdisciplinary collaboration.</p>
</list-item>
</list>
<p>The remainder of the paper is structured as follows: Section 2 reviews previous research to provide context and familiarize readers with the current state of knowledge in the field. Section 3 delves into the materials, methods, and specifics of the proposed model, laying the groundwork for understanding the subsequent experiments. In Section 4, experimental results are presented to demonstrate the proposed model&#x2019;s performance under various conditions. Section 5 discusses limitations encountered during the research process, promoting transparency and encouraging critical examination. Finally, Section 6 consolidates conclusions drawn from the experimental results and suggests potential avenues for future research.</p>
</sec>
<sec id="s1_3">
<label>1.3</label>
<title>Related work</title>
<p>The research evaluating the severity of plant diseases using Convolutional Neural Networks (CNNs) primarily focused on two main approaches. The first category involves techniques centered around image segmentation, while the second focuses on enhancing CNNs, predominantly by incorporating the Attention Mechanism (<xref ref-type="bibr" rid="B20">Naga Srinivasu et&#xa0;al., 2020</xref>).</p>
<p>Segmentation-based methods typically utilize popular segmentation networks like DeepLabV3+, U-Net, PSPNet, and Mask R-CNN. For instance, <xref ref-type="bibr" rid="B31">Wang et&#xa0;al. (2022)</xref> refined the SwinT network for data augmentation and identifying actual cucumber leaf diseases. Meanwhile, <xref ref-type="bibr" rid="B34">Wu et&#xa0;al. (2022)</xref> obtained a remarkable disease classification accuracy of 96.40% for <italic>tomato</italic> leaf diseases by implementing various improvements to DETR. Additionally, <xref ref-type="bibr" rid="B22">Reedha et&#xa0;al. (2022)</xref> leveraged ViT to classify weed and crop images acquired via Unmanned Aerial Vehicles, resulting in an outstanding F1 score of 99.28%. Lastly, <xref ref-type="bibr" rid="B15">Li et&#xa0;al. (2022)</xref> put forth a lightweight network grounded in copy&#x2013;paste and SegFormer for precise disease-region segmentation and severity assessment, yielding a MIoU of 85.38%.</p>
<p>Aside from segmentation-focused methods, researchers explored alternative ways to improve CNNs, mainly concentrating on introducing the Attention Mechanism. <xref ref-type="bibr" rid="B40">Zhang et&#xa0;al. (2023)</xref> utilized a three-stage methodology to classify &#x201c;Huangguan&#x201d; pears. Initially, Mask R-CNN facilitated the segmentation of &#x201c;Huangguan&#x201d; pears from intricate backdrops; subsequently, DeepLabV3+, U-Net, and PSPNet served to segment &#x201c;Huangguan&#x201d; pear spots, calculating the proportion of spot area relative to the total number of pixels. This ratio was classified into three distinct grades. During the final phase, ResNet-50, VGG-16, and MobileNetV3 contributed to determining the pear&#x2019;s condition level.</p>
<p>
<xref ref-type="bibr" rid="B17">Liu et&#xa0;al. (2021)</xref> applied a staged segmentation concept. First, they separated apple leaves from complicated environments using a deep learning algorithm before detecting the affected regions on the isolated leaves. Subsequently, they gauged the severity of illnesses by computing the ratio of damaged tissue to the entire leaf area.</p>
<p>Moreover, the Attention Mechanism gained prominence in recent studies. <xref ref-type="bibr" rid="B38">Yin et&#xa0;al. (2022)</xref> modified the DCNN through integration of multi-scale and Attention Mechanisms, ultimately realizing maize small leaf spot classification. Separately, <xref ref-type="bibr" rid="B17">Liu et&#xa0;al. (2021)</xref> combined multi-scale convolution kernels and Coordinate Attention Mechanism in SqueezeNext to estimate illness severity, leading to a 3.02% improvement over the initial SqueezeNext model.</p>
</sec>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Experimental environment</title>
<p>In this study, publicly accessible datasets were utilized, specifically the Kaggle Dataset (<xref ref-type="bibr" rid="B1">Afzaal et&#xa0;al., 2021</xref>), to create a customized dataset tailored to the training and evaluation requirements of this study. The input image size was standardized to 128 &#xd7; 128 pixels. However, it is important to note that the original images in the dataset had varying resolutions. Initially, the Kaggle Dataset comprised 1972 images encompassing seven distinct strawberry diseases. By employing an augmentation process, the overall dataset size was substantially expanded, resulting in a total of 4574 images available in two resolutions: 512 &#xd7; 512 pixels and 640 &#xd7; 640 pixels. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> demonstrates a detailed breakdown of how these images were distributed across various disease categories, which provides a comprehensive overview of the dataset&#x2019;s composition.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Statistics of the Raw and Augmented Datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Disease No.</th>
<th valign="top" align="left">Disease name</th>
<th valign="top" align="left">Raw Images count</th>
<th valign="top" align="left">Augmented images count</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="left">Angular leaf spot</td>
<td valign="top" align="left">245</td>
<td valign="top" align="left">569</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="left">Anthracnose fruit rot</td>
<td valign="top" align="left">52&#x2013;156</td>
<td valign="top" align="left">118&#x2013;354</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="left">Blossom blight</td>
<td valign="top" align="left">119&#x2013;357</td>
<td valign="top" align="left">273&#x2013;819</td>
</tr>
<tr>
<td valign="top" align="left">4</td>
<td valign="top" align="left">Gray mold</td>
<td valign="top" align="left">254</td>
<td valign="top" align="left">606</td>
</tr>
<tr>
<td valign="top" align="left">5</td>
<td valign="top" align="left">Leaf spot</td>
<td valign="top" align="left">369</td>
<td valign="top" align="left">919</td>
</tr>
<tr>
<td valign="top" align="left">6</td>
<td valign="top" align="left">Powdery mildew fruit</td>
<td valign="top" align="left">79&#x2013;273</td>
<td valign="top" align="left">185&#x2013;555</td>
</tr>
<tr>
<td valign="top" align="left">7</td>
<td valign="top" align="left">Powdery mildew leaf</td>
<td valign="top" align="left">318</td>
<td valign="top" align="left">752</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Dataset annotation and preparation</title>
<p>In this study, the innovative Segment Anything Model (SAM) integrated into the Roboflow annotation tool (<xref ref-type="bibr" rid="B24">Roboflow, 2023</xref>) was utilized to expedite the annotation and preparation of a strawberry disease dataset. This integration allowed for swift annotation of complex strawberry disease instances using a smart polygon tool in the Roboflow editor. SAM demonstrated proficiency in handling intricate object boundaries found in various disease manifestations, enabling the efficient creation of accurate segmentation masks. This approach not only saved considerable time, but also ensured the precision and quality of the annotations. The integration of SAM into the Roboflow annotation tool proved to be a valuable asset, simplifying data preparation and enhancing the accuracy of the semantic segmentation task in this research.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Dataset Augmentation and preprocessing</title>
<p>A comprehensive set of augmentation techniques was employed to enhance the quality and diversity of the strawberry diseases&#x2019; dataset. Data augmentation was performed on all of the training, validation, and test sets. The augmentation processes included horizontal flips, which help the model adapt to different orientations. Additionally, hue adjustments within the range of -21 to +21&#xb0;, saturation variations from -5% to +5%, and brightness changes spanning from -25% to +25% were applied. These modifications contribute to the dataset robustness by simulating different lighting conditions and color variations. To introduce realistic imperfections, a blur with a maximum radius of 2.5 pixels and noise affecting up to 8% of the pixels were incorporated. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> illustrates a representative example of applying various augmentation scenarios to powdery mildew leaf images. These augmentation strategies are presented in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref> and play a crucial role in improving the dataset variability and aiding the proposed SegFormer-based semantic segmentation model in effectively recognizing and classifying strawberry diseases.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Representative example of data augmentation scenarios on powdery mildew on leaves. The top row shows the original images, while the bottom row illustrates the images after resizing and data enrichment procedures. The first column displays images after adjustments, including a decrease in hue by -8&#xb0;, saturation by -1%, brightness by -12%, along with a 1px blur and 8% noise. The second column presents images after horizontal flipping, a 2&#xb0; hue increase, 5% saturation increase, 23% brightness increase, a 1.75px blur, and 3.25% noise. In the third column, images are shown following an increase in hue by 19&#xb0;, a 2% saturation boost, 23% brightness enhancement, along with 0.5px blur and 0.75% noise. The fourth column depicts images after a 19&#xb0; hue increase, and the fifth column shows images with a -18&#xb0; reduction in hue.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g001.tif"/>
</fig>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Augmentation methods and their respective settings.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Method</th>
<th valign="top" align="left">Settings</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Flip</td>
<td valign="top" align="left">Horizontal</td>
</tr>
<tr>
<td valign="top" align="left">Hue</td>
<td valign="top" align="left">Between -21&#xb0; and +21&#xb0;</td>
</tr>
<tr>
<td valign="top" align="left">Saturation</td>
<td valign="top" align="left">Between -5 and +5%</td>
</tr>
<tr>
<td valign="top" align="left">Brightness</td>
<td valign="top" align="left">Between -25 and +25%</td>
</tr>
<tr>
<td valign="top" align="left">Blur</td>
<td valign="top" align="left">Up to 2.5px</td>
</tr>
<tr>
<td valign="top" align="left">Noise</td>
<td valign="top" align="left">Up to 8% of pixels</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The process of dividing a dataset into training, validation, and test subsets is a fundamental step in deep learning model development, ensuring the model&#x2019;s generalizability and performance evaluation. In this study, a diverse dataset containing various plant diseases was analyzed. To achieve a balanced and representative split, the size of each class was considered. With 569 images of Angular Leaf Spot, 354 images of Anthracnose Fruit Rot, 819 images of Blossom Blight, 606 images of Gray Mold, 919 images of Leaf Spot, 555 images of Powdery Mildew Fruit, and 752 images of Powdery Mildew Leaf, the data were appropriately distributed. Typically, a common practice is to allocate a significant portion of the dataset to training, around 80&#x2013;90%, to allow the model to learn from a substantial amount of data. The validation set, which is usually 5&#x2013;10% of the data, is employed during model development to fine-tune hyperparameters and monitor training progress. The remaining portion, the test set, serves as an unseen dataset to evaluate the model performance objectively, as illustrated in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Dataset distribution.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Disease type</th>
<th valign="top" align="center">Total samples</th>
<th valign="top" align="center">Training size</th>
<th valign="top" align="center">Validation size</th>
<th valign="top" align="center">Test size</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Angular leaf spot</td>
<td valign="top" align="left">569</td>
<td valign="top" align="left">514</td>
<td valign="top" align="left">37</td>
<td valign="top" align="left">18</td>
</tr>
<tr>
<td valign="top" align="left">Anthracnose fruit rot</td>
<td valign="top" align="left">354</td>
<td valign="top" align="left">310</td>
<td valign="top" align="left">30</td>
<td valign="top" align="left">14</td>
</tr>
<tr>
<td valign="top" align="left">Blossom blight</td>
<td valign="top" align="left">273</td>
<td valign="top" align="left">225</td>
<td valign="top" align="left">26</td>
<td valign="top" align="left">18</td>
</tr>
<tr>
<td valign="top" align="left">Gray mold</td>
<td valign="top" align="left">606</td>
<td valign="top" align="left">534</td>
<td valign="top" align="left">58</td>
<td valign="top" align="left">18</td>
</tr>
<tr>
<td valign="top" align="left">Leaf spot</td>
<td valign="top" align="left">919</td>
<td valign="top" align="left">765</td>
<td valign="top" align="left">74</td>
<td valign="top" align="left">40</td>
</tr>
<tr>
<td valign="top" align="left">Powdery mildew fruit</td>
<td valign="top" align="left">555</td>
<td valign="top" align="left">470</td>
<td valign="top" align="left">50</td>
<td valign="top" align="left">33</td>
</tr>
<tr>
<td valign="top" align="left">Powdery mildew leaf</td>
<td valign="top" align="left">752</td>
<td valign="top" align="left">651</td>
<td valign="top" align="left">64</td>
<td valign="top" align="left">37</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Efficient Segmentation model training with PyTorch Lightning Framework</title>
<p>In this study, PyTorch Lightning was employed as a powerful deep learning framework to train a semantic segmentation model on a strawberry diseases dataset. PyTorch Lightning provided a streamlined and highly efficient platform for the training process. It abstracted the underlying complexities of training, concentrating on model architecture and experimentation. The use of Lightning structured training loops and integrated callbacks, such as early stopping and model checkpointing, enhanced productivity, while its built-in support for distributed training and reproducibility contributed to the robustness of this research. The resulting model, based on the Segformer architecture, demonstrated impressive performance in semantic segmentation, making PyTorch Lightning an invaluable component of the methodology of the study.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Early Stopping and model checkpointing</title>
<p>Two crucial techniques were employed in this study for enhancing the training of deep learning models: Early Stopping and Model Checkpointing. The Early Stopping callback is an invaluable addition to the training regimen. It continuously monitors the validation loss as the model learns, and its role is to identify when the progress plateaus. This is defined by such parameters as &#x2018;min_delta,&#x2019; which specifies the minimum change in validation loss to be considered as a meaningful improvement. If no substantial improvement is observed for a predefined number of consecutive epochs, set at 10 in the present study, Early Stopping steps in and terminates the training process, preventing unnecessary overfitting and saving valuable computational resources.</p>
<p>On the other hand, ModelCheckpoint plays a pivotal role in preserving the best version of the proposed model. By specifying &#x2018;save_top_k=1&#x2019; and monitoring the &#x2018;val_loss,&#x2019; it ensures that only the finest Model Checkpoint, the one with the lowest validation loss, is stored. This is crucial because it safeguards the model superior performance and provides a safety net in case of unforeseen interruptions during training. The harmonious interplay of Early Stopping and Model Checkpointing allows to train the proposed deep learning model efficiently, striking a balance between performance optimization and resource management.</p>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>The proposed model architecture</title>
<p>In this study, the SegFormer architecture was harnessed (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>) and fine-tuned for precise semantic segmentation and object detection. NVIDIA&#x2019;s advanced SegFormer model, rooted in this architecture, was designed for specialized computer vision tasks. SegFormer&#x2019;s core strength lies in its Transformer-based backbone, which excels at capturing contextual information in images. Its encoder-decoder structure and innovative Mix Feed-Forward Network (Mix-FFN) approach address positional encoding and model efficiency challenges, contributing to high-performance yet resource-efficient models.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>SegFormer Architecture Overview: The FFN indicates a Feed-Forward Network. H, W define the input image height and width. C defines the channel dimension in the MLP decoder and N_cls is the number of semantic classes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g002.tif"/>
</fig>
<p>Self-attention mechanisms, a hallmark of Transformer models, dynamically focus on relevant image regions. Fine-tuning, using a pre-trained model on the extensive ADE20K dataset, refines the model knowledge for the specific purpose of this study. The dataset diversity enhances the model proficiency in semantic segmentation and scene understanding.</p>
<p>For strawberry disease segmentation, MiT-B0 and MiT-B3 were tailored to handle 512 &#xd7; 512 pixel images, while MiT-B5 was configured for 640 &#xd7; 640 pixel images. These customizations suit the models to the unique demands of this task.</p>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> presents an overview of SegFormer architectural components, which includes both encoding and decoding modules. Within the encoder, the Transformer block utilizes Overlap Patch Embeddings (OPEs) to extract feature representations and down-sample the input image. These extracted features are then fed into two critical components: the Efficient Self-Attention (ESA) and the Mix Feed-Forward Network (Mix-FFN). Here are their components and functionalities: the FFN indicates a Feed-Forward Network; H and W represent the height and width of the original image, respectively; the Transformer Block is the basic structure of the SegFormer backbone network.</p>
<p>To calculate the OPE, standard convolutional layers are employed. Following this, the 2D features are spatially reshaped into 1D representations and subsequently input into the ESA layer. The ESA layer plays a pivotal role in enhancing features through self-attentive computations. To address positional encoding, a 3 &#xd7; 3 convolution is thoughtfully introduced between the two linear layers of the FFN. This convolutional operation effectively fuses positional information into the network.</p>
<p>In the encoder, Layer Normalization (LN) sequentially follows linear layers, guaranteeing normalized representation of input features. Gaussian Error Linear Units are adopted as the activation function, imparting non-linear properties to the model. Crucially, the encoder deploys numerous instances of Efficient Self-Attention (ESA) modules and Mix Feed-Forward Networks (Mix-FFNs), collectively increasing the depth of the network and enabling the discovery of subtle distinctions and semantic traits. Notably, individual self-attention calculations occur at each scale inside the ESA, differing from earlier network designs executing cross-scale self-attention computations following merger via CNNs. This independent computation style improves the quality and particularity of self-attention mechanisms at respective scales, enhancing pattern recognition and relationship formation.</p>
<p>The present research implements the assorted Mix Transformer encoders (MiT) in the model&#x2019;s encoder, namely MiT-B0, MiT-B3, and MiT-B5. Classified as real-time SegFormer candidates, MiT-B0 and MiT-B3 excel in speed, while MiT-B5 adheres to the non-real-time standard favoring heightened accuracy. Outlined in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>, the principal hyperparameters of these models facilitate comparison. Experimentation entails trialing the three dissimilar SegFormer configurations to identify optimal solutions for detecting various strawberry disorders. Serving as an economical option, MiT-B0 possesses a diminished parameter count of approximately 3.4 million in the encoder and 0.4 million in the decoder. Superior performing MiT-B3 accumulates nearly 47.3 million parameters in total, representing a potent candidate amongst real-time alternatives. Further expanding upon its predecessors, MiT-B5 sports a grander configuration featuring 84.7 million parameters. The detailed comparison of the MiT encoders is shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Hyperparameters of MiT-B0, MiT-B3, and MiT-B5 architectures.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Parameters</th>
<th valign="top" align="left">MiT-B0</th>
<th valign="top" align="left">MiT-B3</th>
<th valign="top" align="left">MiT-B5</th>
</tr>
<tr>
<th valign="top" colspan="3" align="left">Overlapping Patch embedding</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Channel number, C</td>
<td valign="top" align="left">[32 64 160 256]*</td>
<td valign="top" align="left">[64 128 320 512]</td>
<td valign="top" align="left">[64 128 320 512]</td>
</tr>
<tr>
<td valign="top" align="left">Patch size, K</td>
<td valign="top" align="left">[7 3 3 3]*</td>
<td valign="top" align="left">[7 3 3 3]</td>
<td valign="top" align="left">[7 3 3 3]</td>
</tr>
<tr>
<td valign="top" align="left">Stride, S</td>
<td valign="top" align="left">[4 2 2 2]*</td>
<td valign="top" align="left">[4 2 2 2]</td>
<td valign="top" align="left">[4 2 2 2]</td>
</tr>
<tr>
<td valign="top" align="left">Padding, P</td>
<td valign="top" align="left">[3 1 1 1]*</td>
<td valign="top" align="left">[3 1 1 1]</td>
<td valign="top" align="left">[3 1 1 1]</td>
</tr>
<tr>
<th valign="top" colspan="4" align="left">Transformer encoder</th>
</tr>
<tr>
<td valign="top" align="left">Head number, N</td>
<td valign="top" align="left">[1 2 5 8]*</td>
<td valign="top" align="left">[1 2 5 8]</td>
<td valign="top" align="left">[1 2 5 8]</td>
</tr>
<tr>
<td valign="top" align="left">Encoder layers number, L</td>
<td valign="top" align="left">[2 2 2 2]*</td>
<td valign="top" align="left">[3 3 18 3]</td>
<td valign="top" align="left">[3 6 40 3]</td>
</tr>
<tr>
<td valign="top" align="left">Reduction ratio, R</td>
<td valign="top" align="left">[8 4 2 1]*</td>
<td valign="top" align="left">[8 4 2 1]</td>
<td valign="top" align="left">[8 4 2 1]</td>
</tr>
<tr>
<td valign="top" align="left">Expansion ratio of the feed-forward layer, E</td>
<td valign="top" align="left">[8 8 4 4]*</td>
<td valign="top" align="left">[8 8 4 4]</td>
<td valign="top" align="left">[8 8 4 4]</td>
</tr>
<tr>
<th valign="top" colspan="4" align="left">MLP decoder</th>
</tr>
<tr>
<td valign="top" align="left">Channel dimension</td>
<td valign="top" align="left">256</td>
<td valign="top" align="left">768</td>
<td valign="top" align="left">768</td>
</tr>
<tr>
<th valign="top" colspan="4" align="left">Encoder and Decoder sizes</th>
</tr>
<tr>
<td valign="top" align="left">Encoder size, parameters</td>
<td valign="top" align="left">3.4</td>
<td valign="top" align="left">44.0</td>
<td valign="top" align="left">81.4</td>
</tr>
<tr>
<td valign="top" align="left">Decoder size, Parameters</td>
<td valign="top" align="left">0.4</td>
<td valign="top" align="left">3.3</td>
<td valign="top" align="left">3.3</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>*The values in the list correspond to the predefined settings for stage-1 to stage-4.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>&#x2713; The values in the list correspond to the predefined settings for stages from stage-1 to stage-4.</p>
<p>&#x2713; Input tensor: typically, the SegFormer model expects input tensors with a shape of (batch_size, 3, height, width).</p>
<p>&#x2713; Kernel size: convolutions within the stem layer use 3 &#xd7; 3 kernels.</p>
<p>&#x2713; Strides: the value is set to 1 for the majority of the layers in SegFormer.</p>
<p>&#x2713; Activation function: the SegFormer model frequently employs GELU (Gaussian Error Linear Unit).</p>
<p>&#x2713; The learning rate used is 0.00002.</p>
<p>MiT-B0 is the most compact model optimized for real-time applications, MiT-B3 is the larger model suitable for real-time tasks, and MiT-B5 is the largest model specifically designed for high-performance applications.</p>
</sec>
<sec id="s2_7">
<label>2.7</label>
<title>Architectural and mechanical variations between mix transformer encoders-decoders</title>
<p>The steps for Understanding SegFormer Variants and their operations can be summarized as follows:</p>
<p>1. Examine three mix transformer encoder options&#x2014;MiT-B0, MiT-B3, and MiT-B5&#x2014;each having different sizes, depths, and complexities impacting their capabilities (details are present in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>):</p>
<list list-type="bullet">
<list-item>
<p>MiT-B0: Smallest encoder with 32&#x2013;256 channel counts, 4&#x2013;2 patch resolution, 2 layers per stage, 1 head per layer, and fixed 8x MLP expansion ratios. Trades off feature learning and global context modeling for efficiency.</p>
</list-item>
<list-item>
<p>MiT-B3: Greater capacity with 64&#x2013;512 channel counts, 4&#x2013;2 patch resolution, 3&#x2013;18 layers per stage, 1&#x2013;2 heads per layer, and flexible 8x-4x MLP expansion ratios. Balances efficiency and performance.</p>
</list-item>
<list-item>
<p>MiT-B5: Prioritizes representational power over efficiency, having 3&#x2013;40 layers per stage, 1&#x2013;8 heads per layer, and larger width and depth for maximized global context modeling and rich feature learning.</p>
</list-item>
</list>
<p>2. Follow SegFormer decoder&#x2019;s four main steps:</p>
<list list-type="bullet">
<list-item>
<p>Obtain feature maps from the four encoder stages and pass them through an MLP layer to modify channel dimensions (256, 768, and 768 for MiT-B0, MiT-B3, and MiT-B5).</p>
</list-item>
<list-item>
<p>Up-sample or rescale features to a quarter of their original size and concatenate to build a feature map with 256 or 768 channels.</p>
</list-item>
<list-item>
<p>Combine consecutive features using an MLP layer.</p>
</list-item>
<list-item>
<p>Generate semantic segmentation predictions using another MLP layer and the merged feature.</p>
</list-item>
</list>
<p>Notable is that different encoder architectures lead to varying balances between model size, feature learning, and inference latency, causing distinctions in segmentation proficiency and efficiency.</p>
</sec>
<sec id="s2_8">
<label>2.8</label>
<title>Evaluation metrics</title>
<p>There are several common evaluation metrics used to assess the performance of segmentation models. These metrics help measure the accuracy and quality of the segmentation results.</p>
<sec id="s2_8_1">
<label>2.8.1</label>
<title>Pixel accuracy (accuracy): calculation of pixel-wise category counts</title>
<p>&#x2022; Let G represent the ground truth image with correct category labels, and P represent the predicted image with category labels. Additionally, let H and W denote the height and width of the labelled image, respectively. P<sub>ij</sub> signifies the count of pixels where the true label is category i, and they were predicted as category j. The calculation for P<sub>ij</sub> is as follows, <xref ref-type="disp-formula" rid="eq1">
<bold>Equation 1</bold>
</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>H</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>W</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
<mml:mo>.</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>&#x2022; where:</p>
<p>&#x2022; <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msub>
<mml:mtext>P</mml:mtext>
<mml:mrow>
<mml:mtext>ij</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the count of pixels, where the actual category label is <italic>i</italic> in the ground truth image, and they are predicted as category <italic>j</italic> in the predicted image.</p>
<p>&#x2022;  <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mtext>h</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mtext>H</mml:mtext>
</mml:msubsup>
<mml:mo>:</mml:mo>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> the outer summation, which iterates over the height (<italic>h</italic>) of the label images. The height of the image is denoted by H.</p>
<p>&#x2022;  <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mtext>w</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mtext>W</mml:mtext>
</mml:msubsup>
<mml:mo>:</mml:mo>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> the inner summation, iterating over the width (<italic>w</italic>) of the label images. The width of the image is denoted by W.</p>
<p>&#x2022; <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> : this is the Kronecker delta function, which checks whether the pixel at coordinates (<italic>h</italic>,<italic>w</italic>) in the ground truth image (<italic>G</italic>) has the category label <italic>i</italic>. If the condition is true, <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> equals 1; otherwise, it equals 0.</p>
<p>&#x2022; <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> : similarly, this Kronecker delta function checks whether the pixel at coordinates (<italic>h</italic>,<italic>w</italic>) in the predicted image (<italic>P</italic>) has the category label <italic>j</italic>. It equals 1 if the condition is true and 0 if it is false.</p>
<p>Pixel Accuracy calculates the fraction of correctly classified pixels in the entire image. It provides a measure of overall pixel-level accuracy. Mean Pixel Accuracy (MPA) and Pixel Accuracy (PA) are expressed mathematically as follows, <xref ref-type="disp-formula" rid="eq2">
<bold>Equation 2</bold>
</xref>:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where:</p>
<p>
<inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the Pixel Accuracy for category <inline-formula>
<mml:math display="inline" id="im8">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>
<inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the count of pixels where both the actual category label and the predicted label are <inline-formula>
<mml:math display="inline" id="im10">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula>. In other words, it is the count of true positives for category <inline-formula>
<mml:math display="inline" id="im11">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula>. These are the pixels that were correctly predicted as category <inline-formula>
<mml:math display="inline" id="im12">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>
<inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> is a summation over <italic>j</italic> from 0 to <italic>k</italic>, where <italic>k</italic> represents the total category numbers (including background categories). It calculates the total count of pixels that are supposed to be category <italic>i</italic> in the ground truth image, regardless of whether they were predicted correctly or not.</p>
</sec>
<sec id="s2_8_2">
<label>2.8.2</label>
<title>Mean pixel accuracy</title>
<p>Mean Pixel Accuracy, sometimes called Mean Accuracy, calculates the average accuracy of each class. It takes into account the class-wise accuracy and computes the mean, <xref ref-type="disp-formula" rid="eq3">
<bold>Equation 3</bold>
</xref>.</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>A</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where:</p>
<p>
<inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the pixel accuracy of the i-th class, k refers to the total number of classes, and the &#x201c;+1&#x201d; accounts for the background class. Essentially, MPA averages the individual class accuracies, providing a holistic measure of segmentation performance considering all classes present in the dataset.</p>
</sec>
<sec id="s2_8_3">
<label>2.8.3</label>
<title>Mean Intersection over Union (Jaccard Index)</title>
<p>Mean IoU, or Mean Intersection over Union, quantifies the extent of overlap between the predicted segmentation masks and the corresponding ground truth masks. In other words, it represents the ratio of the intersection for class <italic>i</italic> to the union for class <italic>i</italic>, <xref ref-type="disp-formula" rid="eq4">
<bold>Equations 4</bold>
</xref>&#x2013;<xref ref-type="disp-formula" rid="eq6">
<bold>6</bold>
</xref>. Mean IoU is a valuable metric for evaluating the accuracy and precision of semantic image segmentation models (<xref ref-type="disp-formula" rid="eq7">
<bold>Equation 7</bold>
</xref>), where the intersection for class <italic>i</italic> is, <xref ref-type="disp-formula" rid="eq4">
<bold>Equation 4</bold>
</xref>:</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2229;</mml:mo>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Union for class <italic>i</italic> is given by, <xref ref-type="disp-formula" rid="eq5">
<bold>Equation 5</bold>
</xref>:</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x222a;</mml:mo>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>
<inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the predicted and ground truth masks for class <italic>i</italic>, respectively.</p>
<p>The mathematical expression for IoU is <xref ref-type="disp-formula" rid="eq6">
<bold>Equation 6</bold>
</xref>
</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula>: This part sums up all the pixels that should be category <italic>i</italic> in the ground truth image, whether they were predicted correctly or not. It includes true positives and false negatives for category <italic>i</italic>. <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula>: Similarly, this part represents the sum of the count of pixels that were predicted as category <italic>j</italic> and are supposed to be category <italic>i</italic> in the ground truth image. It includes true positives and false positives for category <italic>i.</italic>
</p>
<p>By subtracting <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> removes the overlap between the true positives (common pixels between predicted and ground truth). This adjustment ensures that the IoU measures the proportion of the correctly predicted pixels relative to the total area that should be category <italic>i</italic> in the ground truth, excluding the overlap.</p>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In the context of the segmentation task, k denotes the highest valid class label, while k+1 corresponds to the total number of classes.</p>
</sec>
<sec id="s2_8_4">
<label>2.8.4</label>
<title>FLOPs</title>
<p>Floating-Point Operations per Second (FLOPS) involves determining the number of floating-point operations a computer or a processor can perform in one second. FLOPS is a commonly used metric to measure the computational performance of hardware, such as CPUs, GPUs, or accelerators. GFLOPs (Giga-Floating-Point Operations per Second) represent one billion floating-point operations per second.</p>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and discussion</title>
<p>The training and testing setup in this study involved specific hardware and software configurations. The computer used for this research is equipped with 10th generation Intel (R) Core (TM) i7&#x2013;10870H processor, featuring 16 threads, 8 cores, a base clock speed of 2.21GHz, and a turbo speed of 5GHz. It is equipped with 16MB cache memory and supports a maximum memory size of 128GB (DDR4&#x2013;2933). The graphics processing unit employed is the NVIDIA GeForce RTX3060, boasting 3840 CUDA cores and 6 GB of video memory. The operating system utilized is Windows 10, and the software stack includes PyTorch Lightning version 1.9.5, Python version 3.8, and CUDA version 11. PyTorch Lightning serves as a lightweight wrapper for PyTorch, streamlining the process of training and evaluating PyTorch models.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Segmentation visualization for various scenarios</title>
<p>To assess the efficacy of SegFormer, a comprehensive evaluation of the model&#x2019;s performance was conducted on the testing set. The testing set was systematically divided into distinct subsets, each catering to different disease perspectives. These divisions were primarily based on the nature of the disease, the clarity of disease manifestations, and the density of disease regions.</p>
<p>To evaluate the model&#x2019;s ability to handle various disease types, three different disease perspectives for each disease were selected as research objects. The results of semantic segmentation of these diverse disease types are pictured at <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. Moreover, for assessing the model&#x2019;s performance in distinguishing between clear and blurry disease manifestations, samples representing both scenarios were selected, and their segmentation results were visually represented. Additionally, the study investigated the model&#x2019;s competence in handling the sparseness and density of disease manifestations. Two samples were chosen to represent each scenario, and the segmentation results are visually presented in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. The visualized results demonstrate SegFormer&#x2019;s remarkable ability to accurately identify and segment various disease types, consistent with manually labeled and segmented data, validating its effectiveness in semantic segmentation. <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> presents a comparative analysis of various Mix Transformer encoder models in diagnosing six prevalent strawberry diseases. The tested models include MiT-B0, MiT-B3, and MiT-B5, evaluated on angular leaf spot, anthracnose fruit rot, blossom blight, gray mold, leaf spot, powdery mildew on fruit, and powdery mildew on leaves. Each entry contains the corresponding test loss, test mean pixel accuracy (MPA), test mean Intersection over Union (mIoU), computation complexity (GFLOPs), and the total estimated model parameter size in megabytes (MB). This detailed comparison helps assess each model&#x2019;s performance, computational efficiency, and model size to guide developers and researchers towards an informed decision when selecting an appropriate model for specific strawberry disease detection tasks.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Visual representation of segmentation across various instances of strawberry diseases.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g003.tif"/>
</fig>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Comparative performance analysis of various mix transformer encoder models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Encoder type</th>
<th valign="top" align="center">Test-loss</th>
<th valign="top" align="center">Test-MPA</th>
<th valign="top" align="center">Test-mIoU</th>
<th valign="top" align="center">GFLOPs</th>
<th valign="top" align="center">Total estimated model params size (MB)</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="top" colspan="6" align="center">1. Angular leaf spot disease</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.0669</td>
<td valign="top" align="left">0.95748</td>
<td valign="top" align="left">0.9238</td>
<td valign="top" align="left">0.846</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<th valign="top" colspan="6" align="center">2. Anthracnose fruit rot</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.1586</td>
<td valign="top" align="left">0.9321</td>
<td valign="top" align="left">0.8784</td>
<td valign="top" align="left">1.692</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<th valign="top" colspan="6" align="center">3. Blossom blight</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.0663</td>
<td valign="top" align="left">0.90891</td>
<td valign="top" align="left">0.8784</td>
<td valign="top" align="left">2.541</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<th valign="top" colspan="6" align="center">4. Gray mold</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.191</td>
<td valign="top" align="left">0.87751</td>
<td valign="top" align="left">0.7844</td>
<td valign="top" align="left">0.846</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<td valign="top" align="left">MiT-B3</td>
<td valign="top" align="center">0.095</td>
<td valign="top" align="left">0.9134</td>
<td valign="top" align="left">0.8567</td>
<td valign="top" align="left">26.76</td>
<td valign="top" align="left">188.896</td>
</tr>
<tr>
<td valign="top" align="left">MiT-B5</td>
<td valign="top" align="center">0.1299</td>
<td valign="top" align="left">0.895</td>
<td valign="top" align="left">0.835</td>
<td valign="top" align="left">37.41</td>
<td valign="top" align="left">338.380</td>
</tr>
<tr>
<th valign="top" colspan="6" align="center">5. Leaf spot</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.2023</td>
<td valign="top" align="left">0.9456</td>
<td valign="top" align="left">0.897</td>
<td valign="top" align="left">2.12</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<th valign="top" colspan="6" align="center">6. Powdery mildew fruit</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.319</td>
<td valign="top" align="left">0.9296</td>
<td valign="top" align="left">0.8271</td>
<td valign="top" align="left">2.96</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<th valign="top" colspan="6" align="center">7. Powdery mildew leaf</th>
</tr>
<tr>
<td valign="top" align="left">MiT-B0</td>
<td valign="top" align="center">0.1726</td>
<td valign="top" align="left">0.691</td>
<td valign="top" align="left">0.6352</td>
<td valign="top" align="left">1.27</td>
<td valign="top" align="left">14.859</td>
</tr>
<tr>
<td valign="top" align="left">MiT-B3</td>
<td valign="top" align="center">0.1677</td>
<td valign="top" align="left">0.7753</td>
<td valign="top" align="left">0.6823</td>
<td valign="top" align="left">13.38</td>
<td valign="top" align="left">188.896</td>
</tr>
<tr>
<td valign="top" align="left">MiT-B5</td>
<td valign="top" align="center">0.1731</td>
<td valign="top" align="left">0.762</td>
<td valign="top" align="left">0.680</td>
<td valign="top" align="left">18.70</td>
<td valign="top" align="left">338.380</td>
</tr>
<tr>
<td valign="top" align="left">MiT-B3</td>
<td valign="top" align="center">0.0775</td>
<td valign="top" align="left">0.91293</td>
<td valign="top" align="left">0.8732</td>
<td valign="top" align="left">26.77</td>
<td valign="top" align="left">188.902</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Based on the visual results for segmentation of various strawberry diseases <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref> represents an explanation of the key observations:</p>
<list list-type="bullet">
<list-item>
<p>For Angular Leaf Spot disease, MiT-B3 and MiT-B5 perform well in identifying multiple spots on the leaves. MiT-B0 struggles with smaller spots. MiT-B5 delineates boundaries most cleanly.</p>
</list-item>
<list-item>
<p>On Anthracnose Fruit Rot, all three encoders (MiT-B0, MiT-B3, MiT-B5) achieve accurate localization and segmentation of the disease regions. MiT-B5 produces the most precise segmentation boundaries.</p>
</list-item>
<list-item>
<p>For Gray Mold, MiT-B3 and MiT-B5 accurately capture the diffuse disease regions, while MiT-B0 misses some portions. MiT-B3 provides finer segmentation.</p>
</list-item>
<list-item>
<p>On Leaf Spot disease, MiT-B3 and MiT-B5 precisely identify and segment the multiple disease spots. MiT-B0 can detect some smaller spots. MiT-B5 offers the highest precision.</p>
</list-item>
<list-item>
<p>For Powdery Mildew on Leaves, MiT-B5 clearly outperforms MiT-B0 and MiT-B3 in detecting the scattered powdery patterns. Its segmentation aligns closely with ground truth.</p>
</list-item>
<list-item>
<p>On Powdery Mildew on Fruits, all encoders of MiT-B0, MiT-B3 and MiT-B5 achieve good localization. MiT-B5 provides the most accurate delineation.</p>
</list-item>
<list-item>
<p>Finally, for Blossom Blight, all encoders effectively identify the affected flower regions.</p>
</list-item>
</list>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Visual representation of strawberry diseases segmentation process.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g004.tif"/>
</fig>
<p>As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, MiT-B3 and MiT-B5 consistently outperform MiT-B0 across disease types, with MiT-B5 achieving the most precise segmentation in general. The results highlight the importance of selecting appropriate encoders matched to disease characteristics and use cases.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Boosting model training performance through augmentation techniques</title>
<p>The comparative results validate that data augmentation provided notable benefits for model training using the MiT-B3 encoder on the powdery mildew leaf disease dataset. Specifically, training with augmented data led to faster convergence evidenced by lower losses, reduced overfitting indicated by smaller gaps between training and validation metrics, more stable validation performance, and higher accuracy. For instance, by epoch 39 the training mean IoU reached 0.9 with augmentation versus 0.86 without. Meanwhile, the validation mean IoU improved gradually to 0.69 with augmentation compared to more fluctuation and ending at 0.68 without. Similarly, validation mean accuracy climbed to 0.76 with augmented data versus plateauing at 0.74 without. The consistent improvements in key metrics like loss, IoU, and accuracy demonstrate that introducing expanded diversity through augmentation techniques helped the model generalize better and boosted its capabilities, as shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Optimizing Model Training Performance with Augmentation Methods.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g005.tif"/>
</fig>
<p>The comparisons clearly validate that augmentation enabled superior training and segmentation performance, allowing the MiT-B3 encoder to learn faster and achieve higher metrics on the powdery mildew leaf disease dataset.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Unleashing model potential: early stopping and checkpointing for precise strawberry disease detection</title>
<p>This section demonstrates the transformative power of early stopping and model checkpointing in optimizing a deep learning model for strawberry disease detection, as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>. By strategically employing these techniques, impressive results were achieved:</p>
<list list-type="bullet">
<list-item>
<p>Training and validation mIoU reaching 0.96 and 0.93, respectively, after 175 epochs.</p>
</list-item>
<list-item>
<p>Remarkably low training and validation losses of 0.042 and 0.015 - a testament to the combined effectiveness of these methodologies.</p>
</list-item>
</list>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Graphs of Training and Validation Sets, along with Performance Metrics for SegFormer Evaluation on angular leaf spot and anthracnose fruit rot diseases using MiT-B0 Mix Transformer Encoders.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g006.tif"/>
</fig>
<p>
<bold>Early stopping</bold>, a vigilant guardian, constantly monitored validation loss during training. When progress plateaued, it intervened, preventing overfitting and saving the model from memorizing training data instead of learning generalizable features.</p>
<p>
<bold>Model checkpointing</bold> acted as a reliable safety net, preserving the best performing model versions throughout training. This invaluable technique ensured that progress was not lost due to potential training hiccups.</p>
<p>Together, these techniques fostered a harmonious balance between model complexity and generalization. The model effectively generalized to the unseen data, accurately identifying various strawberry diseases (Angular leaf spot, Anthracnose fruit rot, Blossom blight) under natural conditions.</p>
<p>The consistent performance across different diseases underscores the approach&#x2019;s robustness. In synergy with innovative deep learning techniques, meticulous data preparation, and effective monitoring, early stopping and model checkpointing pave the way for real-world applications demanding high precision, like disease detection in agriculture.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Dissecting blossom blight detection: MiT-B3 outshines MiT-B0 in SegFormer models</title>
<p>Understanding blossom blight in strawberries through deep learning is crucial for effective disease management. This section compares two prominent architectures, MiT-B0 and MiT-B3, within SegFormer models to see which encoder excels in detection, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>. The results clarify key factors for choosing the right model for tackling specific diseases.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Graphs of Training and Validation Sets, along with Performance Metrics for SegFormer Evaluation on blossom blight disease using MiT-B0 and MiT-B3 Mix Transformer Encoders.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g007.tif"/>
</fig>
<p>MiT-B0: While showing potential, consistency remains a hurdle. During training, its mean IoU (a measure of segmentation accuracy) fluctuates significantly. This suggests difficulty adapting to the disease&#x2019;s diverse manifestations. However, the gradual rise in validation mean IoU indicates promising generalization to unseen data. Further investigation is needed to unlock MiT-B0&#x2019;s full potential for consistent accuracy.</p>
<p>MiT-B3: This architecture outperforms in both rapid adaptation and stability. Training mean IoU experiences a remarkable jump from 0.34 to 0.7 within a single epoch, demonstrating efficient learning of disease features. Even after initial fluctuations, validation mean IoU stabilizes and steadily climbs, reaching 0.85. This signifies successful adaptation and consistent accuracy on unseen data, making MiT-B3 ideal for real-world scenarios. The choice of Mix Transformer encoder significantly impacts performance. While MiT-B0 shows potential, MiT-B3 dominates when it comes to swift adaptation and reliable detection. Its rapid learning and strong validation performance make it the clear winner for applications demanding fast adaptation and real-world disease detection.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Unveiling the gray mold buster: MiT-B3 reigns supreme in SegFormer models</title>
<p>Combating gray mold in strawberries requires effective detection tools. This section investigates three Mix Transformer encoders within SegFormer models - MiT-B0, MiT-B3, and MiT-B5 - to find the champion disease detective, as shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. The results hold valuable insights for both disease detection and model selection.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Training, Validation Sets, and Performance Metrics for SegFormer-Based Model Evaluation on Gray Mold Strawberry Disease using MiT-B0, MiT-B3, and MiT-B5 Mix Transformer Encoders.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g008.tif"/>
</fig>
<sec id="s3_5_1">
<label>3.5.1</label>
<title>MiT-B0</title>
<p>A solid contender, but with room for improvement. While converging well with similar training and validation losses (0.12 and 0.18), a lower validation mIoU (0.79) compared to training (0.87) implies possible overfitting. However, consistent accuracy across training and validation (0.91 vs. 0.87) shows promise.</p>
</sec>
<sec id="s3_5_2">
<label>3.5.2</label>
<title>MiT-B3</title>
<p>Exceptional generalization and fitting are evident in its low training (0.045) and validation losses (0.19). High mIoU values for both training and validation (indicating ability to capture disease details) solidify its lead. Even on unseen test data, it scores a strong mIoU of 0.8567. Impressively high accuracy, especially on the test set, confirms its reliable gray mold identification under diverse conditions.</p>
</sec>
<sec id="s3_5_3">
<label>3.5.3</label>
<title>MiT-B5</title>
<p>Training loss exhibits some instability, potentially impacting performance. While training mIoU is high (0.909), validation and test mIoU are slightly lower (0.82 and 0.835, respectively). This encoder demonstrates respectable scores, although lacks the consistency of MiT-B3. Its high training accuracy (0.95) is mirrored in validation and test sets (0.89 and 0.895), indicating potential but requiring further optimization.</p>
</sec>
<sec id="s3_5_4">
<label>3.5.4</label>
<title>Key takeaways</title>
<p>&#x2022; Encoder choice matters: MiT-B3 consistently outperforms the others in mIoU, accuracy, and convergence.</p>
<p>&#x2022; MiT-B0 is well-balanced but susceptible to overfitting.</p>
<p>&#x2022; MiT-B3 is the champion with exceptional performance and generalization.</p>
<p>&#x2022; MiT-B5 shows potential, but requires refinement for stability.</p>
<p>The findings: For tackling gray mold, MiT-B3 proves to be the most effective encoder. Its exceptional performance and impressive generalization power make it an invaluable tool for accurate disease detection in real-world scenarios. This study underscores the importance of matching the encoder to the specific disease for optimal results, paving the way for improved strawberry protection and enhanced agricultural practices.</p>
</sec>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Detecting leaf spot and powdery mildew with SegFormer models</title>
<p>This section explores the ability of MiT-B0, a Mix Transformer encoder, within SegFormer models to detect two distinct strawberry diseases: leaf spot and powdery mildew fruit disease (<xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>).</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Graphs of Training and Validation Sets, along with Performance Metrics for SegFormer Evaluation on leaf spots and powdery mildew fruit diseases using MiT-B0 and MiT-B3 Mix Transformer Encoders.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g009.tif"/>
</fig>
<sec id="s3_6_1">
<label>3.6.1</label>
<title>Leaf spot</title>
<p>&#x2022; <bold>Training loss:</bold> Experienced two peaks, suggesting temporary difficulty due to disease complexity. However, it eventually reached a low value of 0.05.</p>
<p>&#x2022; <bold>Validation loss:</bold> Steadily decreased and plateaued at 0.05, indicating consistent performance on unseen data.</p>
<p>&#x2022; <bold>Mean IoU:</bold> Training mIoU reached a high of 0.98, while validation mIoU stabilized at 0.88, demonstrating effective learning and reliable detection.</p>
<p>&#x2022; <bold>Accuracy:</bold> Both training and validation accuracy were high (0.98 and 0.93 respectively), confirming accurate disease identification.</p>
<p>
<bold>Powdery Mildew Fruit Disease:</bold>
</p>
<p>&#x2022; <bold>Training loss:</bold> Fluctuated within 0.12 but peaked significantly at epoch 56. Ultimately, it decreased to 0.1.</p>
<p>&#x2022; <bold>Validation loss:</bold> Showed a steadier decrease, plateauing at 0.22 and achieving a test loss of 0.319.</p>
<p>&#x2022; <bold>Mean IoU:</bold> Training mIoU was high at 0.92, while validation mIoU was slightly lower at 0.81, indicating efficient learning but less accurate validation performance.</p>
<p>&#x2022; <bold>Accuracy:</bold> Training and validation accuracy remained strong (0.98 and 0.88 respectively), with a test accuracy of 0.9296.</p>
</sec>
<sec id="s3_6_2">
<label>3.6.2</label>
<title>Key takeaways</title>
<p>&#x2022; <bold>Adaptability:</bold> The model successfully tackled both diseases, highlighting its potential for diverse applications.</p>
<p>&#x2022; <bold>Learning Power:</bold> Consistent validation performance signifies effective learning despite training loss fluctuations.</p>
<p>&#x2022; <bold>Trade-offs:</bold> Higher complexity (leaf spot) might cause temporary training challenges, but the model adapts and stabilizes.</p>
<p>MiT-B0 proves adaptable in detecting different strawberry diseases. While training loss may fluctuate with disease complexity, the model demonstrates its ability to learn, generalize, and achieve reliable detection, making it a promising tool for precision agriculture.</p>
</sec>
</sec>
<sec id="s3_7">
<label>3.7</label>
<title>Decoding powdery mildew: finding the best AI detector with SegFormer models</title>
<p>This section delves into the performance of SegFormer models equipped with three Mix Transformer encoders (MiT-B0, MiT-B3, and MiT-B5) for detecting powdery mildew on leaves, as shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>. Each model reveals unique behaviors and outcomes, offering valuable insights for choosing the right tool for the job.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Training, Validation Sets, and Performance Metrics for SegFormer-Based Model Evaluation on powdery mildew leaf diseases using MiT-B0, MiT-B3 and MiT-B5 Mix Transformer Encoders.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1352935-g010.tif"/>
</fig>
<sec id="s3_7_1">
<label>3.7.1</label>
<title>MiT-B0: efficient learner, room for growth</title>
<p>&#x2022; Initial training demonstrates difficulties with loss fluctuations, revealing adaptation challenges.</p>
<p>&#x2022; Validation loss stays stable, providing good generalization for unseen data.</p>
<p>&#x2022; The encoder achieves a respectable mean IoU of 0.89 and accuracy of 0.92.</p>
<p>&#x2022; Its low computational cost (1.269 GFLOPs) makes it a budget-friendly option.</p>
</sec>
<sec id="s3_7_2">
<label>3.7.2</label>
<title>MiT-B3: speedy adapter, ideal for new disease variants</title>
<p>&#x2022; It quickly adapts during training, boosting mean IoU to 0.9 and accuracy to 0.94.</p>
<p>&#x2022; Validation performance also thrives, aided by early stopping that enabled efficient training within 60 epochs.</p>
<p>&#x2022; It is ideal for scenarios demanding swift adaptation to novel disease variants.</p>
</sec>
<sec id="s3_7_3">
<label>3.7.3</label>
<title>MiT-B5: fast learner, high accuracy (but pricey)</title>
<p>&#x2022; It converges rapidly with early stopping, reaching a high mean IoU of 0.9 and accuracy of 0.93 on both training and validation.</p>
<p>&#x2022; It takes fewer epochs but demands more computational power (18.70 GFLOPs).</p>
<p>&#x2022; It is perfect for situations where accuracy is paramount and resources are plentiful.</p>
</sec>
<sec id="s3_7_4">
<label>3.7.4</label>
<title>Matching tool to task: a balancing act</title>
<p>&#x2022; Encoder choice significantly impacts performance and adaptation speed.</p>
<p>&#x2022; Complex diseases like powdery mildew benefit from MiT-B3&#x2019;s quick adaptation.</p>
<p>&#x2022; For efficiency-driven applications, MiT-B0 might be the best option.</p>
<p>When selecting the optimal Mix Transformer encoder, the specific disease, dataset, and resource constraints should be considered. Understanding the trade-off between computation, training time, and accuracy is crucial for real-world success. This detailed analysis empowers informed decision-making for disease detection tasks, ensuring the application of the most suitable AI tools.</p>
</sec>
</sec>
<sec id="s3_8">
<label>3.8</label>
<title>Comparative analysis with other segmentation models</title>
<p>To assess the effectiveness of the mix transformer encoders under study, several major segmentation models were trained and fine-tuned using the training and validation sets. <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref> below provides a comparative analysis of popular segmentation models and the proposed SegFormer variants. The comparison covers essential metrics like Total Parameters (M), mean Intersection over Union (mIoU), Mean Pixel Accuracy (MPA), and Flops (G). This comprehensive evaluation assists researchers and practitioners in determining the optimal model for their specific computer vision tasks, considering the trade-offs between model complexity, computational cost, and segmentation performance. Presented here are widely used models such as U-Net, DeepLabV3+, SegNet, and SETR, together with the newly proposed SegFormer configurations equipped with MiT-B0, MiT-B3, and MiT-B5 encoders.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparative analysis with other segmentation models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Encode/Backbone</th>
<th valign="top" align="center">Total Params (M)</th>
<th valign="top" align="center">mIoU(%)</th>
<th valign="top" align="center">MPA (%)</th>
<th valign="top" align="center">FLOPs(G)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">U-Net<break/>
<xref ref-type="bibr" rid="B25">Ronneberger et&#xa0;al., 2015</xref>
</td>
<td valign="top" align="center">MobileNetV2</td>
<td valign="top" align="center">24.33</td>
<td valign="top" align="center">63.9</td>
<td valign="top" align="center">69.3</td>
<td valign="top" align="center">45.23</td>
</tr>
<tr>
<td valign="top" align="center">Vgg16</td>
<td valign="top" align="center">24.89</td>
<td valign="top" align="center">37.50</td>
<td valign="top" align="center">46.04</td>
<td valign="top" align="center">451.77</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">DeepLabV3+<break/>
<xref ref-type="bibr" rid="B6">Chen et&#xa0;al., 2018</xref>
</td>
<td valign="top" align="center">DenseNet-121</td>
<td valign="top" align="center">51.32</td>
<td valign="top" align="center">66.2</td>
<td valign="top" align="center">73.8</td>
<td valign="top" align="center">182.36</td>
</tr>
<tr>
<td valign="top" align="center">Xception</td>
<td valign="top" align="center">54.71</td>
<td valign="top" align="center">32.10</td>
<td valign="top" align="center">51.25</td>
<td valign="top" align="center">166.87</td>
</tr>
<tr>
<td valign="top" align="left">SegNet<break/>
<xref ref-type="bibr" rid="B2">Badrinarayanan et&#xa0;al., 2017</xref>
</td>
<td valign="top" align="center">VGG16</td>
<td valign="top" align="center">29.46</td>
<td valign="top" align="center">57.0</td>
<td valign="top" align="center">62.7</td>
<td valign="top" align="center">284.10</td>
</tr>
<tr>
<td valign="top" align="left">SETR<break/>
<xref ref-type="bibr" rid="B44">Zheng et&#xa0;al., 2021</xref>
</td>
<td valign="top" align="center">ViT-Large</td>
<td valign="top" align="center">318.3</td>
<td valign="top" align="center">69.5</td>
<td valign="top" align="center">75.3</td>
<td valign="top" align="center">720.68</td>
</tr>
<tr>
<td valign="top" align="left">HRNet<break/>
<xref ref-type="bibr" rid="B30">Wang et&#xa0;al., 2020</xref>
</td>
<td valign="top" align="center">NA</td>
<td valign="top" align="center">29.54</td>
<td valign="top" align="center">31.02</td>
<td valign="top" align="center">53.75</td>
<td valign="top" align="center">79.96</td>
</tr>
<tr>
<td valign="top" align="left">ECA-SegFormer<break/>
<xref ref-type="bibr" rid="B36">Yang et&#xa0;al., 2023</xref>
</td>
<td valign="top" align="center">NA</td>
<td valign="top" align="center">4.04</td>
<td valign="top" align="center">38.03</td>
<td valign="top" align="center">60.86</td>
<td valign="top" align="center">10.64</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">PSPNet<break/>
<xref ref-type="bibr" rid="B41">Zhao et&#xa0;al., 2017</xref>
</td>
<td valign="top" align="center">MobileNetV2</td>
<td valign="top" align="center">15.4</td>
<td valign="top" align="center">61.1</td>
<td valign="top" align="center">68.2</td>
<td valign="top" align="center">84.9</td>
</tr>
<tr>
<td valign="top" align="center">Resnet50</td>
<td valign="top" align="center">46.71</td>
<td valign="top" align="center">28.51</td>
<td valign="top" align="center">42.03</td>
<td valign="top" align="center">118.43</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="left">Proposed SegFormer</td>
<td valign="top" align="center">MiT-B0</td>
<td valign="top" align="center">
<bold>3.7</bold>
</td>
<td valign="top" align="center">
<bold>65.33</bold>
</td>
<td valign="top" align="center">
<bold>71.54</bold>
</td>
<td valign="top" align="center">
<bold>1.27</bold>
</td>
</tr>
<tr>
<td valign="top" align="center">MiT-B3</td>
<td valign="top" align="center">
<bold>47.2</bold>
</td>
<td valign="top" align="center">
<bold>65.31</bold>
</td>
<td valign="top" align="center">
<bold>75.73</bold>
</td>
<td valign="top" align="center">
<bold>13.38</bold>
</td>
</tr>
<tr>
<td valign="top" align="center">MiT-B5</td>
<td valign="top" align="center">
<bold>84.7</bold>
</td>
<td valign="top" align="center">
<bold>67.78</bold>
</td>
<td valign="top" align="center">
<bold>76.89</bold>
</td>
<td valign="top" align="center">
<bold>18.70</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values highlight the model that performed best or had the maximum value for each evaluation metric. NA, Not Available. </p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>, starting with model complexity - SegFormer demonstrates highly competitive performance with significantly fewer model parameters than state-of-the-art models such as SETR and DeepLabV3+. For instance, even the largest MiT-B5 variant of SegFormer has 85% fewer parameters than SETR. This indicates SegFormer can match or exceed the capabilities of much larger models with far fewer parameters.</p>
<p>In terms of accuracy, measured by mean IoU and mean pixel accuracy, SegFormer consistently achieves outstanding results, outperforming classic models like U-Net, SegNet, and PSPNet. The MiT-B5 variant in particular exceeds DeepLabV3+ and comes close to SETR, which is remarkable given SETR&#x2019;s massive size. This shows the representation power and generalization ability of SegFormer.</p>
<p>Finally, regarding efficiency, SegFormer requires significantly lower Floating Point Operations (FLOPs) compared to prior models like SETR and PSPNet. The smallest MiT-B0 SegFormer operates at less than 2 GFLOPs, enabling real-time inference on edge devices. Even MiT-B5 operates at nearly 4x lower FLOPs than SETR.</p>
<p>SegFormer establishes a new state-of-the-art in semantic segmentation across all key aspects - lower model complexity, greater accuracy, and higher efficiency. For strawberry disease segmentation, SegFormer provides the right balance of performance, accuracy, and efficiency as evidenced by the comparative analysis. This makes it the ideal choice to deploy in real-world agriculture applications.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Limitations and challenges</title>
<p>Despite the promising results and contributions of this research, there are certain limitations that require consideration. Addressing these constraints could give prospects for future exploration and improvements in the field of strawberry disease detection.</p>
<p>&#x2022; <bold>Limited scope of dataset:</bold> Although the current study uses an adequately sized and diversified dataset, incorporating additional sources and increasing the volume of data could lead to more robust and generalizable models. Exploring multisource data fusion, combining images taken under different lighting conditions, geographical locations, and camera angles could further strengthen the model&#x2019;s performance.</p>
<p>&#x2022; <bold>Impact of weather conditions:</bold> Environmental factors, such as temperature, humidity, and sunlight exposure play a significant role in the appearance of strawberry diseases. Investigating the influence of these variables on model performance and accounting for dynamic weather conditions could result in more accurate and adaptable models.</p>
<p>&#x2022; <bold>Integration with Internet of Things (IoT) platforms:</bold> Connecting the strawberry disease detection system with IoT devices, such as sensors and cameras installed in greenhouses, would facilitate real-time monitoring and decision-making. Further research could explore integrating the proposed model with IoT infrastructure for seamless implementation in agricultural settings.</p>
<p>&#x2022; <bold>Human-computer interaction for user feedback:</bold> Developing intuitive user interfaces that allow users to provide feedback on model outputs could create opportunities for continuous learning and model improvement. Iteratively updating the model based on expert user inputs could result in more accurate and trustworthy systems.</p>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>This study has demonstrated the successful application of the SegFormer segmentation model for precise semantic segmentation of strawberry diseases, striving to enhance disease detection accuracy under natural acquisition conditions. The analysis of three distinct Mix Transformer encoders&#x2014;MiT-B0, MiT-B3, and MiT-B5&#x2014;has revealed their unique behaviors and benefits, catering to varying needs in disease detection applications. Adopting the novel SAM integrated into the Roboflow annotation tool enabled efficient annotation and preparation of a strawberry disease dataset, while rigorous augmentation techniques ensured the dataset&#x2019;s quality and diversity. Balanced partitioning of the dataset into training, validation, and test subsets guaranteed fair evaluation and optimized model performance. Implementing PyTorch Lightning, a potent deep learning framework, resulted in a finely tuned semantic segmentation model displaying impressive training and validation mIoU scores of 0.96 and 0.93, respectively. Moreover, SegFormer emerged victorious in comparative tests against other renowned segmentation models, outshining classical competitors such as U-Net, SegNet, and PSPNet in mean IoU and mean pixel accuracy. Crucially, SegFormer demonstrated its prowess operating with significantly fewer parameters and lower FLOPs than cutting-edge alternatives like SETR and DeepLabV3+, cementing its status as a compelling solution for practical agriculture applications. These findings hold great promise for the future of disease detection systems, suggesting that carefully chosen encoders paired with advanced models can deliver substantial improvements in accuracy, efficiency, and adaptability. As a consequence, researchers now have access to actionable insights for selecting the most suitable encoder in disease detection applications, propelling the field forward for further investigation. 
Future work in this domain includes multi-modal input integration, transfer learning across crops, online learning systems, scalable solutions, custom hardware development, benchmarking and standardization initiatives, open research platforms, and codebase creation. Ultimately, the goal is to establish robust, accessible, and adaptable AI technologies that empower stakeholders in the agricultural sector to make informed decisions and implement timely actions for sustainable food production.</p>
</sec>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://www.kaggle.com/datasets/usmanafzaal/strawberry-disease-detection-dataset">https://www.kaggle.com/datasets/usmanafzaal/strawberry-disease-detection-dataset</uri>.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>WE: Conceptualization, Data curation, Methodology, Writing &#x2013; original draft, Formal analysis, Investigation, Project administration, Software, Supervision, Validation, Visualization, Writing &#x2013; review &amp; editing. JG: Writing &#x2013; review &amp; editing, Data curation, Investigation, Methodology, Software, Validation, Writing &#x2013; original draft. MS: Conceptualization, Data curation, Investigation, Methodology, Software, Writing &#x2013; original draft. SA: Conceptualization, Formal analysis, Project administration, Resources, Validation, Visualization, Writing &#x2013; review &amp; editing. FM: Conceptualization, Formal analysis, Project administration, Resources, Visualization, Writing &#x2013; review &amp; editing. TE-M: Writing &#x2013; review &amp; editing, Conceptualization, Formal analysis, Project administration, Validation, Visualization. DB: Funding acquisition, Writing &#x2013; review &amp; editing, Investigation, Methodology, Software. DM: Data curation, Methodology, Writing &#x2013; review &amp; editing. MM: Investigation, Methodology, Writing &#x2013; review &amp; editing. YP: Methodology, Software, Writing &#x2013; review &amp; editing. SE: Conceptualization, Formal analysis, Validation, Writing &#x2013; original draft. AE: Conceptualization, Formal analysis, Investigation, Validation, Writing &#x2013; original draft. TE-H: Validation, Formal analysis, Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This study was supported by the Russian Science Foundation (grant No. 21-16-00124, <ext-link ext-link-type="uri" xlink:href="https://rscf.ru/en/project/21-16-00124/">https://rscf.ru/en/project/21-16-00124/</ext-link>).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>The authors extend their appreciation to the Deanship of Scientific Research at King Khalid University and express their gratitude to the ITMO Fellowship and Professorship Program, ITMO University, St. Petersburg, Russia. </p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Afzaal</surname> <given-names>U.</given-names>
</name>
<name>
<surname>Bhattarai</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Pandeya</surname> <given-names>Y. R.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>) <article-title>An Instance Segmentation Model for Strawberry Diseases Based on Mask R-CNN</article-title>. Available at: <uri xlink:href="https://www.kaggle.com/datasets/usmanafzaal/strawberry-disease-detection-dataset/">https://www.kaggle.com/datasets/usmanafzaal/strawberry-disease-detection-dataset/</uri>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s21196565</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Badrinarayanan</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Kendall</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Cipolla</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Segnet: A deep convolutional encoder-decoder architecture for image segmentation</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume>, <fpage>2481</fpage>&#x2013;<lpage>2495</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barre</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Stover</surname> <given-names>B. C.</given-names>
</name>
<name>
<surname>Muller</surname> <given-names>K. F.</given-names>
</name>
<name>
<surname>Steinhage</surname> <given-names>V.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>LeafNet: A computer vision system for automatic plant species identification</article-title>. <source>Ecol. Inf.</source> <volume>40</volume>, <fpage>50</fpage>&#x2013;<lpage>56</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2017.05.005</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Carion</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Massa</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Synnaeve</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Usunier</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Kirillov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zagoruyko</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>End-to-end object detection with transformers</article-title>,&#x201d; in <source>European conference on computer vision</source>. (<publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>213</fpage>&#x2013;<lpage>229</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Focal ViT: image transformer catches up with CNN on small datasets</article-title>,&#x201d; in <conf-name>International Conference on Computer, Artificial Intelligence, and Control Engineering (CAICE 2023)</conf-name>. (<publisher-loc>Bellingham, Washington</publisher-loc>: <publisher-name>SPIE</publisher-name>) <volume>12645</volume>. <fpage>307</fpage>&#x2013;<lpage>313</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1117/12.2681103</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>L. C.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Papandreou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Encoder-decoder with atrous separable convolution for semantic image segmentation</article-title>&#x201d;, <fpage>801</fpage>&#x2013;<lpage>818</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chouhan</surname> <given-names>S. S.</given-names>
</name>
<name>
<surname>Singh</surname> <given-names>U. P.</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Applications of computer vision in plant pathology: A survey</article-title>. <source>Arch. Computat Methods Eng.</source> <volume>27</volume>, <fpage>611</fpage>&#x2013;<lpage>632</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11831-019-09324-0</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Craze</surname> <given-names>H. A.</given-names>
</name>
<name>
<surname>Pillay</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Joubert</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Berger</surname> <given-names>D. K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Deep learning diagnostics of gray leaf spot in maize under mixed disease field conditions</article-title>. <source>Plants</source> <volume>11</volume>, <fpage>1942</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/plants11151942</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deenan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Janakiraman</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Nagachandrabose</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Image segmentation algorithms for Banana leaf disease diagnosis</article-title>. <source>J. Inst. Eng. Ser. C</source> <volume>101</volume>, <fpage>807</fpage>&#x2013;<lpage>820</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s40032-020-00592-5</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yue</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Li</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Automatic recognition of strawberry diseases and pests using convolutional neural network</article-title>. <source>Smart Agric. Technol.</source> <volume>1</volume>, <elocation-id>100009</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.atech.2021.100009</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. <source>arXiv</source> <volume>2010</volume>, <fpage>11929</fpage>.</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>FlexFormer: Flexible Transformer for efficient visual recognition</article-title>. <source>Pattern Recognition Lett.</source> <volume>169</volume>, <fpage>95</fpage>&#x2013;<lpage>101</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patrec.2023.03.028</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Avsegformer: Audio-visual segmentation with transformer</article-title>,&#x201d; <source>Proceedings of the AAAI Conference on Artificial Intelligence</source> <volume>38</volume>, <fpage>12155</fpage>&#x2013;<lpage>12163</lpage>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Image recognition of four rice leaf diseases based on deep learning and support vector machine</article-title>. <source>Comput. Electron. Agric.</source> <volume>179</volume>, <fpage>105824</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105824</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Shuai</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>A copy paste and semantic segmentation-based approach for the classification and assessment of significant rice diseases</article-title>. <source>Plants</source> <volume>11</volume>, <fpage>3174</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/plants11223174</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Spatial convolutional self-attention-based transformer module for strawberry disease identification under complex background</article-title>. <source>Comput. Electron. Agric.</source> <volume>212</volume>, <elocation-id>108121</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108121</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision, Montreal, QC, Canada</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Luzi</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Gupta</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Collins</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Bradbury</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Malof</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Transformers for recognition in overhead imagery: A reality check</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>3778</fpage>&#x2013;<lpage>3787</lpage>.
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Minaee</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Boykov</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Porikli</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Plaza</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kehtarnavaz</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Terzopoulos</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Image segmentation using deep learning: A survey</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>3523</fpage>&#x2013;<lpage>3542</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2021.3059968</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Naga Srinivasu</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Srinivasa Rao</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Dicu</surname> <given-names>A. M.</given-names>
</name>
<name>
<surname>Mnerie</surname> <given-names>C. A.</given-names>
</name>
<name>
<surname>Olariu</surname> <given-names>I.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A comparative review of optimisation techniques in segmentation of brain MR images</article-title>. <source>J. Intelligent Fuzzy Systems</source> <volume>38</volume>, <fpage>6031</fpage>&#x2013;<lpage>6043</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3233/JIFS-179688</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pugoy</surname> <given-names>R. A.</given-names>
</name>
<name>
<surname>Mariano</surname> <given-names>V.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>Automated rice leaf disease detection using color image analysis</article-title>,&#x201d; in <conf-name>Third International Conference on Digital Image Processing</conf-name>, Vol. <volume>8009</volume> (<publisher-loc>Bellingham, WA, USA</publisher-loc>: <publisher-name>SPIE</publisher-name>).</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reedha</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Dericquebourg</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Canals</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Hafiane</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Transformer neural network for weed and crop classification of high resolution UAV images</article-title>. <source>Remote Sens.</source> <volume>14</volume>, <fpage>592</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14030592</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Revathi</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Hemalatha</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Classification of cotton leaf spot diseases using image processing edge detection techniques</article-title>,&#x201d; in <conf-name>Proceedings of the 2012 International Conference on Emerging Trends in Science, Engineering and Technology (INCOSET), Tiruchirappalli, India, 13&#x2013;14 December 2012, Piscataway, NJ, USA</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>169</fpage>&#x2013;<lpage>173</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="web">
<person-group person-group-type="author">
<collab>Roboflow</collab>
</person-group> (<year>2023</year>). Available at: <uri xlink:href="https://universe.roboflow.com/">https://universe.roboflow.com/</uri> (Accessed <access-date>1/10/2023</access-date>).</citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: Convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <source>Medical image computing and computer-assisted intervention &#x2013; MICCAI 2015: 18th international conference, Munich, Germany, October 5&#x2013;9, 2015, proceedings, part III</source>. (<publisher-name>Springer International Publishing</publisher-name>), <volume>18</volume>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shit</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Das</surname> <given-names>D. K.</given-names>
</name>
<name>
<surname>Ray</surname> <given-names>D. N.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Real-time object detection in deep foggy conditions using transformers</article-title>,&#x201d; in <conf-name>2023 3rd International conference on Artificial Intelligence and Signal Processing (AISP)</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>5</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shruthi</surname> <given-names>U.</given-names>
</name>
<name>
<surname>Nagaveni</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Raghavendra</surname> <given-names>B. K.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>A review on machine learning classification techniques for plant disease detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE 2019 5th international conference on advanced computing &amp; communication systems (ICACCS)</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>281</fpage>&#x2013;<lpage>284</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICACCS.2019.8728415</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="web">
<person-group person-group-type="author">
<collab>TRIDGE</collab>
</person-group> (<year>2023</year>). Available at: <uri xlink:href="https://www.tridge.com/intelligences/stawberry/EG">https://www.tridge.com/intelligences/stawberry/EG</uri> (Accessed <access-date>11/11/2023</access-date>).</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>W&#xe4;ldchen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>M&#xe4;der</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Plant species identification using computer vision techniques: A systematic literature review</article-title>. <source>Arch. Computat Methods Eng.</source> <volume>25</volume>, <fpage>507</fpage>&#x2013;<lpage>543</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11831-016-9206-z</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Deep high-resolution representation learning for visual recognition</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>43</volume> (<issue>10</issue>), <fpage>3349</fpage>&#x2013;<lpage>3364</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Rao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Practical cucumber leaf disease recognition using improved Swin Transformer and small sample size</article-title>. <source>Comput. Electron. Agric.</source> <volume>199</volume>, <fpage>107163</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107163</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>K.y.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>Y.y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Segmentation of crop disease images with an improved K-means clustering algorithm</article-title>. <source>Appl. Eng. Agric.</source> <volume>34</volume>, <fpage>277</fpage>&#x2013;<lpage>289</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13031/aea.12205</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Cui</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Na</given-names>
</name>
<name>
<surname>Ou</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Early identification of strawberry leaves disease utilizing hyperspectral imaging combing with spectral features, multiple vegetation indices and textural features</article-title>. <source>Comput. Electron. Agric.</source> <volume>204</volume>, <elocation-id>107553</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107553</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Su</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>DS-DETR: A model for tomato leaf disease segmentation and damage evaluation</article-title>. <source>Agronomy</source> <volume>12</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy12092023</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Anandkumar</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Alvarez</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>SegFormer: Simple and efficient design for semantic segmentation with transformers</article-title>. <source>Adv. Neural Inf. Process Syst.</source> <volume>34</volume>, <fpage>12077</fpage>&#x2013;<lpage>12090</lpage>.</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Semantic segmentation of cucumber leaf disease spots based on ECA-SegFormer</article-title>. <source>Agriculture</source> <volume>13</volume> (<issue>8</issue>), <fpage>1513</fpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Ni</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Sung</surname> <given-names>W. K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Deep learning-based segmentation of peach diseases using convolutional neural network</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>876357</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.876357</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yin</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Maize small leaf spot classification based on improved deep convolutional neural networks with a multi-scale attention mechanism</article-title>. <source>Agronomy</source> <volume>12</volume>, <fpage>906</fpage>.</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yong</surname> <given-names>L. Z.</given-names>
</name>
<name>
<surname>Khairunniza-Bejo</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Jahari</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Muharam</surname> <given-names>F. M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Automatic disease detection of basal stem rot using deep learning and hyperspectral imaging</article-title>. <source>Agriculture</source> <volume>13</volume>, <fpage>69</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture13010069</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Cen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Mu</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>CRFormer: Cross-Resolution Transformer for segmentation of grape leaf diseases with context mining</article-title>. <source>Expert Syst. Appl.</source> <volume>229</volume>, <fpage>120324</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.120324</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Pyramid scene parsing network</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>2881</fpage>&#x2013;<lpage>2890</lpage>.</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Identification of Leaf-Scale Wheat Powdery Mildew (Blumeria graminis f. sp. Tritici) Combining Hyperspectral Imaging and an SVM Classifier</article-title>. <source>Plants</source> <volume>9</volume>, <fpage>936</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/plants9080936</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Multiple disease detection method for greenhouse-cultivated strawberry based on multiscale feature fusion Faster R_CNN</article-title>. <source>Comput. Electron. Agric.</source> <volume>199</volume>, <elocation-id>107176</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107176</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, Nashville, TN, USA, 20&#x2013;25</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>6881</fpage>&#x2013;<lpage>6890</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>