<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1743264</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Precision cotton disease detection via transformer models applied to leaf imagery</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Inamdar</surname>
<given-names>Nikhil</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1760254"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Managuli</surname>
<given-names>Manjunath</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2641065"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Koti</surname>
<given-names>Ramesh</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3037869"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jakati</surname>
<given-names>Jagadish</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<uri xlink:href="https://loop.frontiersin.org/people/3338571"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>P. H.</surname>
<given-names>Sharanappa</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kulkarni</surname>
<given-names>Prasan</given-names>
</name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Electronics and Communication Engineering, KLS Gogte Institute of Technology Belagavi and Affiliated to Visvesvaraya Technological University Belagavi Karnataka</institution>,  <city>Belagavi</city>, <country country="in">India</country></aff>
<aff id="aff2"><label>2</label><institution>School of Computer Science Engineering &#x0026; Applications Engineering, D. Y. Patil International University Pune</institution>, <city>Pune</city>, <state>Maharashtra</state>, <country country="in">India</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Electronics and Communication Engineering, Basaveshwara Engineering College</institution>, <city>Bagalkot</city>, <state>Karnataka</state>, <country country="in">India</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Electronics and Communication Engineering, Anuvartik Mirji Bharatesh Institute of Technology</institution>, <city>Belagavi</city>, <state>Karnataka</state>, <country country="in">India</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Nikhil Inamdar, <email xlink:href="mailto:njinamdar@git.edu">njinamdar@git.edu</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-09">
<day>09</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1743264</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>26</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Inamdar, Managuli, Koti, Jakati, PH and Kulkarni.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Inamdar, Managuli, Koti, Jakati, PH and Kulkarni</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-09">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Computerized plant cataloging from leaf photographs holds great potential for agricultural research, ecological monitoring, and biodiversity conservation. This work introduces a deep learning framework that uses transformer-based architectures, such as the Vanilla Vision Transformer (ViT), Swin Transformer, DeiT (Data-Efficient Image Transformer), and T2T-ViT (Tokens-to-Tokens Vision Transformer), to automatically classify cotton leaf diseases. The dataset comprises images of cotton leaves from four classes: curl virus, bacterial blight, fusarium wilt, and healthy leaves. A stratified K-fold hold-out testing technique (K&#x202F;=&#x202F;1 to 5) maintains the class distribution across training and testing folds in order to guarantee robust model evaluation and address class imbalance. Standard image augmentation and normalization are applied to improve generalization and guarantee compatibility with the transformer models. All models are first pretrained on large-scale image collections and then fine-tuned on the cotton leaf data to sharpen their discriminative ability. Results remain stable across multiple test rounds, with the best model reaching a near-perfect accuracy of 99.99%. These findings show that transformer-based systems, paired with stratified K-fold validation, offer a dependable route to early crop disease detection and faster, smarter farm management.</p>
</abstract>
<kwd-group>
<kwd>CNN</kwd>
<kwd>cotton plant</kwd>
<kwd>disease classification</kwd>
<kwd>image classification</kwd>
<kwd>transformer models</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="16"/>
<table-count count="4"/>
<equation-count count="0"/>
<ref-count count="35"/>
<page-count count="14"/>
<word-count count="7096"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>AI in Food, Agriculture and Water</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>In countries across South Asia, such as India, Bangladesh, and parts of southern China, farming still anchors daily life and livelihoods (<xref ref-type="bibr" rid="ref20">Patil and Burkpalli, 2021</xref>; <xref ref-type="bibr" rid="ref18">Nadiruzzaman et al., 2021</xref>). Yet shifting weather patterns brought on by a warming climate are worsening outbreaks of plant disease and eroding harvest yields. Cotton, often called white gold or nature&#x2019;s silk, stands central among globally traded cash crops: its market, valued near 40 billion USD today, may climb toward 60 billion by the end of this decade (<xref ref-type="bibr" rid="ref13">Khairnar and Goje, 2020</xref>; <xref ref-type="bibr" rid="ref17">Meyer et al., 2023</xref>). India&#x2019;s textile industry, valued at over 200 billion USD, stands among the leading forces in global garment exports, and cotton is widely seen as the backbone of this sector. The nation leads the world in cotton production, supported by rich soil and generations of expertise, and the work, from spinning yarn to finishing garments, flows through countless hands across villages and cities alike.</p>
<p>India grows cotton across more than 12.9 million hectares, aided by a sunny climate and well-suited soil. Still, the crop faces many hurdles: diseases, insect pests, erratic rainfall, drought, and sudden temperature swings. Even so, countless farming families depend on it for food and work. In China, diseases and insects erode harvests every year, cutting 15 to 20 percent of the crop&#x2019;s value and, in bad years, up to half. Most of the troubles afflicting Indian cotton first appear on the leaves and can drag yields down by a quarter; spotting those signs sooner could rescue roughly one in five affected plants, which underlines how much depends on early detection. In places such as Bangladesh and India, farmers often diagnose cotton problems by sight, sometimes aided by specialists. Without adequate training, such visual checks tend to fall short and demand heavy effort, yet they remain common across remote regions. After spotting an issue, growers typically turn to chemical sprays, following guidance from advisors (<xref ref-type="bibr" rid="ref7">Gupta and Pathak, 2016</xref>; <xref ref-type="bibr" rid="ref3">Chen et al., 2020</xref>). Automatic detection tools rest on three steps: photographing leaves, extracting distinguishing traits, and analyzing them. Because they excel at uncovering meaningful patterns, computer vision methods, especially deep learning (DL) and machine learning (ML), have drawn growing research interest (<xref ref-type="bibr" rid="ref28">Talukder et al., 2023</xref>; <xref ref-type="bibr" rid="ref24">Saleem et al., 2021</xref>; <xref ref-type="bibr" rid="ref4">Dhaka et al., 2021</xref>).</p>
<p>One study introduced a way to spot diseased soybean leaves by combining k-means clustering with SVM classification (<xref ref-type="bibr" rid="ref12">Kaur et al., 2018</xref>). It reached 90% accuracy across 4,775 images, targeting issues such as bacterial pustule, blight, and mildew, using color plus texture patterns to tell healthy from infected leaves. As deep learning advanced, various studies showed that transformers and convolutional networks do better at spotting and outlining crop diseases, especially when plenty of labeled examples exist (<xref ref-type="bibr" rid="ref27">Talukder et al., 2022</xref>; <xref ref-type="bibr" rid="ref31">Uddin et al., 2023</xref>). In particular, Vision Transformers can grasp fine-grained traits through pre-trained knowledge, working well even when only small or niche image sets are available. That trait makes them a good fit for farming applications where data often runs short.</p>
<sec id="sec2">
<label>1.1</label>
<title>Key research findings are listed below</title>
<sec id="sec3">
<label>1.1.1</label>
<title>DL for cotton leaf disease detection</title>
<p>
<list list-type="bullet">
<list-item>
<p>The study highlights the efficiency of transformer models such as the Vanilla Vision Transformer (ViT), Swin Transformer, DeiT, and T2T-ViT in classifying cotton leaf diseases.</p>
</list-item>
<list-item>
<p>These models accurately differentiate four categories: curl virus, fusarium wilt, healthy leaves, and bacterial blight.</p>
</list-item>
</list>
</p>
</sec>
<sec id="sec4">
<label>1.1.2</label>
<title>High classification accuracy</title>
<p>
<list list-type="bullet">
<list-item>
<p>The model&#x2019;s performance was enhanced by hyperparameter optimization, reaching an exceptional peak accuracy of 99.99%.</p>
</list-item>
</list>
</p>
</sec>
<sec id="sec5">
<label>1.1.3</label>
<title>Potential for early disease diagnosis</title>
<p>
<list list-type="bullet">
<list-item>
<p>The research shows these tools can spot cotton leaf issues sooner&#x2014;important when quick farming decisions matter. By catching problems earlier, farmers gain time to act before damage spreads through crops.</p>
</list-item>
</list>
</p>
</sec>
<sec id="sec6">
<label>1.1.4</label>
<title>Contribution to agriculture and ecology</title>
<p>
<list list-type="bullet">
<list-item>
<p>The approach pushes progress in farm automation while boosting nature protection&#x2014;woven together through smart tech that learns on its own.</p>
</list-item>
</list>
</p>
</sec>
</sec>
</sec>
<sec id="sec7">
<label>2</label>
<title>Literature survey</title>
<p>Researchers have explored extensively how photographs taken under controlled farm conditions can help spot plant diseases. Wheat rot detection, for instance, required overhead shots plus close-ups across fields (<xref ref-type="bibr" rid="ref23">Safari et al., 2022</xref>). Before being fed into a classification system to identify the precise class, these photos were annotated for object recognition, with bounding boxes defined and cropped. Five classification models based on CNN architectures were developed, including VGG16 (<xref ref-type="bibr" rid="ref25">Simonyan and Zisserman, 2014</xref>), ResNet-50 (<xref ref-type="bibr" rid="ref8">He et al., 2016</xref>), Inception (<xref ref-type="bibr" rid="ref26">Szegedy et al., 2016</xref>), MobileNet-V3 (<xref ref-type="bibr" rid="ref9">Howard et al., 2019</xref>), and EfficientNet-B0 (<xref ref-type="bibr" rid="ref29">Tan and Le, 2019</xref>). Of the five, EfficientNet-B0 proved the most accurate and computationally efficient model. Convolutional Neural Networks (CNNs) have become increasingly popular for plant disease identification as AI technologies have advanced. For example, in a comparative analysis on the Plant Village dataset, the DenseNet model, well known for its capacity to reuse feature maps, achieved a noteworthy accuracy of 98.27% (<xref ref-type="bibr" rid="ref1">Akshai and Anitha, 2021</xref>; <xref ref-type="bibr" rid="ref10">Hughes and Salathe, 2015</xref>).</p>
<p>An average F1-score of 95.70% has been attained for apple leaf disease recognition training using ResNet networks with residual structures (<xref ref-type="bibr" rid="ref33">Yu et al., 2022</xref>). By eliminating crucial elements from photos, CNNs are quite successful in mechanically categorizing plant illnesses. CNNs&#x2019; parameter-sharing method, which reduces the number of constraints and overfitting&#x2014;a prevalent problem in computer vision tasks&#x2014;is a major advantage. However, there is a chance of unnecessary computational cost as the system&#x2019;s depth increases. Furthermore, CNNs do not explicitly use pixel positioning information, which may limit their capacity to grasp spatial relationships in the image, even when they are successful at extracting local area features through convolutional layers.</p>
<p>Conventional CNNs may be less successful in identifying plant diseases because they frequently have trouble utilizing pixel positional information. Vision Transformers (ViTs), first presented in <xref ref-type="bibr" rid="ref6">Dosovitskiy et al. (2020)</xref>, use a self-attention mechanism as suggested in <xref ref-type="bibr" rid="ref32">Vaswani et al. (2017)</xref> to get around this. Diverse feature extraction from images is made possible by improvements such as incorporating a ghost module into the ViT encoder, as investigated in <xref ref-type="bibr" rid="ref14">Lu et al. (2022)</xref>. Additionally, the MSCVT model presented in <xref ref-type="bibr" rid="ref35">Zhu et al. (2023)</xref> greatly advances the identification of agricultural diseases by combining the advantages of multiscale convolution and self-attention in a hybrid CNN-ViT architecture.</p>
<p>These studies achieved high accuracy on publicly available datasets but overlooked tea leaf datasets. To close this gap, <xref ref-type="bibr" rid="ref34">Yu et al. (2023)</xref> developed ICVT (Inception Convolutional Vision Transformer), which combines the Inception architecture with cross-channel feature information. Additionally, <xref ref-type="bibr" rid="ref30">Thakur et al. (2021)</xref> introduced PlantViT, a transformer-based method specifically designed for accurate plant disease detection; related work includes <xref ref-type="bibr" rid="ref2">Alharbi et al. (2023)</xref>, <xref ref-type="bibr" rid="ref5">Dhakal et al. (2023)</xref>, <xref ref-type="bibr" rid="ref11">Jenifa et al. (2019)</xref>, <xref ref-type="bibr" rid="ref19">Nigam et al. (2023)</xref>, <xref ref-type="bibr" rid="ref21">Prakash and Managuli (2024a)</xref>, <xref ref-type="bibr" rid="ref15">Managuli et al. (2024)</xref>, <xref ref-type="bibr" rid="ref22">Prakash and Managuli (2024b)</xref>, and <xref ref-type="bibr" rid="ref16">Nikhil et al. (2024)</xref>.</p>
<p>Researchers have explored classic machine learning alongside deep learning approaches to spot plant diseases across multiple projects. One study (<xref ref-type="bibr" rid="ref12">Kaur et al., 2018</xref>) introduced an approach for detecting diseases on soybean leaves through color patterns and texture traits, combining k-means clustering with SVM models to classify problems such as downy mildew, bacterial blight, and frog-eye leaf spot. It reached 90 percent accuracy when tested against a collection of 4,775 images.</p>
<p>The strong pattern recognition skills of CNN-driven designs have pushed their broad use in sorting plant illnesses through deep learning frameworks (<xref ref-type="bibr" rid="ref27">Talukder et al., 2022</xref>; <xref ref-type="bibr" rid="ref31">Uddin et al., 2023</xref>). Yet even with success, these networks can stumble under uneven light, messy surroundings, or when sickness signs appear across scattered parts of leaves. Lately, researchers have begun exploring transformer structures for visual challenges. Thanks to wide-ranging focus mechanisms and an ability to build rich contextual insights, models like ViTs, Swin Transformers, DeiT, and T2T deliver promising outcomes across various image classification areas. Still, applying them to detect crop diseases remains limited&#x2014;especially on data covering multiple species.</p>
<sec id="sec8">
<label>2.1</label>
<title>Research gap</title>
<p>Despite progress in automated plant disease detection, gaps remain. While attention has shifted toward newer methods, studies focusing on transformer models for crop illness classification stay sparse&#x2014;particularly those leveraging diverse, multi-species data. Most prior work leans on CNNs or traditional algorithms, which may overlook extended visual patterns crucial when symptoms look nearly identical across types. Rarely do papers line up ViT, Swin, DeiT, and T2T side by side under one testing framework. Cross-species consistency within shared disease families gets little attention. Evaluation often skips thorough metric analysis, tuning depth, or uniform preprocessing steps.</p>
</sec>
</sec>
<sec sec-type="methods" id="sec9">
<label>3</label>
<title>Methodology</title>
<p><xref ref-type="fig" rid="fig1">Figure 1</xref> displays the framework layout of the proposed method. The entire structure consists of four consecutive steps, each built to enhance precision and robustness in the organizational process. First comes pre-processing&#x2014;then deep feature extraction follows. After that, features undergo refinement through optimization. Finally, classification takes place as the last phase.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Block diagram of the proposed study.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart diagram illustrating a machine learning process for classifying plant diseases, starting with a data set used for training and testing, including pre-processing, classifier selection (ViT, SWIN, DieT, T2T-ViT, Cross-ViT), hyperparameter tuning, and classification outputs of bacterial light, curlvirus, fusarium wilt, or healthy, followed by performance evaluation and result comparison based on accuracy, sensitivity, specificity, and recall.</alt-text>
</graphic>
</fig>
<sec id="sec10">
<label>3.1</label>
<title>Pre-processing</title>
<p>To enhance data quality and guarantee consistency throughout the dataset, the input photos go through pre-processing in the first step. This stage handles tasks such as boosting contrast, resizing images, normalizing pixel values, and reducing random noise. Cleaning the images here removes distractions that could confuse the later steps where patterns start to form.</p>
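<p>As an illustration, the following minimal sketch shows how such a pre-processing pipeline could be expressed with torchvision; the specific augmentation settings are illustrative assumptions rather than the exact configuration used in this study.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Illustrative pre-processing pipeline (assumed torchvision settings,
# not the exact configuration used in this study).
from torchvision import transforms

train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),           # uniform input size for transformers
    transforms.ColorJitter(contrast=0.2),    # mild contrast boost
    transforms.RandomHorizontalFlip(),       # simple augmentation
    transforms.ToTensor(),                   # scales pixel values to [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet statistics,
                         std=[0.229, 0.224, 0.225]),  # matching pretrained backbones
])
]]></code>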
</sec>
<sec id="sec11">
<label>3.2</label>
<title>Extraction of deep features</title>
<p>Once cleaned, the images move into a deep learning model where feature extraction takes place. Through a pretrained or tailored neural network, such as a CNN or a Vision Transformer, the system picks out distinctive, high-level traits on its own. Instead of handcrafted descriptors, it zeroes in on subtle shapes, surface variations, and layout cues that matter for telling one class apart from another.</p>
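<p>A minimal sketch of this step, assuming the timm library and a pretrained ViT backbone; the model name is illustrative rather than the exact checkpoint used here.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Extract deep features with a pretrained backbone (illustrative sketch).
import timm
import torch

# num_classes=0 removes the classification head, so the model returns features.
backbone = timm.create_model("vit_base_patch16_224", pretrained=True, num_classes=0)
backbone.eval()

images = torch.randn(4, 3, 224, 224)     # a stand-in batch of leaf images
with torch.no_grad():
    features = backbone(images)          # shape: (4, 768) for this backbone
print(features.shape)
]]></code>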
</sec>
<sec id="sec12">
<label>3.3</label>
<title>Optimization of features</title>
<p>The refinement stage sharpens the deeply extracted features further. Techniques such as PCA, search-based optimization strategies, or rank-based filters remove redundant dimensions. With fewer distractions, the remaining traits stand out better; overfitting recedes as complexity drops, and the lighter representation means faster processing.</p>
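<p>For instance, PCA-based refinement of the extracted features could look like the sketch below, assuming scikit-learn; the variance threshold and array shapes are arbitrary illustrations.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Reduce feature dimensionality with PCA (illustrative sketch).
import numpy as np
from sklearn.decomposition import PCA

features = np.random.rand(1711, 768)   # stand-in for extracted deep features
pca = PCA(n_components=0.95)           # keep components explaining 95% of variance
reduced = pca.fit_transform(features)
print(reduced.shape)                   # fewer columns, faster downstream training
]]></code>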
</sec>
<sec id="sec13">
<label>3.4</label>
<title>Classification</title>
<p>In the final stage, a classifier takes the refined features and produces the final decision. Methods such as SVMs, neural networks, or a softmax layer sort the inputs into classes, and the system&#x2019;s predictions come directly from these outputs.</p>
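<p>One possible realization is sketched below: a linear SVM fitted on the optimized features, assuming scikit-learn; the arrays and labels are placeholders, not the study&#x2019;s data.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Classify optimized features with an SVM (illustrative sketch).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X = np.random.rand(1711, 128)               # stand-in optimized features
y = np.random.randint(0, 4, size=1711)      # four classes: blight, curl, wilt, healthy

X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y,
                                          test_size=0.3, random_state=0)
clf = SVC(kernel="linear").fit(X_tr, y_tr)  # train the classifier
print("test accuracy:", clf.score(X_te, y_te))
]]></code>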
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Block schematic of vanilla vision transformer model.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating a vision transformer model workflow, showing an input image split into patches, followed by linear projection, transformer blocks with multi-head self-attention, normalization, and final output processing stages.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Classification accuracy considering ViT transformer models.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart showing percentage classification accuracy of a ViT transformer model for various hyperparameters including learning rate, batch size, optimizer, number of epochs, and training testing ratio. Most parameters achieve near 100 percent classification.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec14">
<label>3.5</label>
<title>Dataset</title>
<p>This research targets the classification of cotton plant leaf diseases using transformer models, an area with limited prior exploration compared to other crops. A diverse dataset of leaf images, sourced from studies cited in the literature survey, is utilized. Sample images and associated counts are given in <xref ref-type="table" rid="tab1">Table 1</xref>, highlighting the dataset&#x2019;s composition for effective model training and evaluation.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Sample images from the dataset used.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Sl no</th>
<th align="left" valign="top">Disease type</th>
<th align="center" valign="top">Image</th>
<th align="center" valign="top">No. of images</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">1</td>
<td align="left" valign="middle">Bacterial blight</td>
<td align="center" valign="top">
<inline-graphic xlink:href="frai-08-1743264-i001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Close-up of a green leaf showing signs of disease or damage, with brown spots and discoloration. The leaf has a broad, lobed shape typical of certain plants.</alt-text>
</inline-graphic>
</td>
<td align="center" valign="middle">448</td>
</tr>
<tr>
<td align="left" valign="middle">2</td>
<td align="left" valign="middle">Curl virus</td>
<td align="center" valign="top">
<inline-graphic xlink:href="frai-08-1743264-i002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Green leaf with a distinct, textured surface and pointed edges. The leaf's veins create an intricate pattern, and its vibrant color stands out against a blurred, natural background.</alt-text>
</inline-graphic>
</td>
<td align="center" valign="middle">418</td>
</tr>
<tr>
<td align="left" valign="middle">3</td>
<td align="left" valign="middle">Fussarium_wilt</td>
<td align="center" valign="top">
<inline-graphic xlink:href="frai-08-1743264-i003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Grape leaf with symptoms of discoloration and wilting, showing irregular yellow and brown patches on a green background.</alt-text>
</inline-graphic>
</td>
<td align="center" valign="middle">419</td>
</tr>
<tr>
<td align="left" valign="middle">4</td>
<td align="left" valign="middle">Healthy</td>
<td align="center" valign="top">
<inline-graphic xlink:href="frai-08-1743264-i004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Close-up of a green, heart-shaped leaf with visible veins, set against a background of dry, cracked soil. The leaf appears fresh and healthy.</alt-text>
</inline-graphic>
</td>
<td align="center" valign="middle">426</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Rather than collecting new material, this work draws on a shared online archive, which boosts transparency and supports consistent method evaluation across trials. Its 1,711 images fall into four distinct classes, each with several hundred samples used for training and testing the system. Because every picture came from an identical capture setup, sharpness and visual clarity are consistent throughout.</p>
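<p>A stratified split that preserves these class proportions could be sketched as follows, assuming scikit-learn; the label array is a placeholder built from the per-class counts in <xref ref-type="table" rid="tab1">Table 1</xref>.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Stratified K-fold splitting that preserves class proportions (sketch).
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Placeholder labels reproducing the per-class counts from Table 1.
y = np.array([0] * 448 + [1] * 418 + [2] * 419 + [3] * 426)
X = np.arange(len(y)).reshape(-1, 1)      # stand-in image indices

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    # Each fold keeps roughly the same class ratio as the full dataset.
    print(f"fold {fold}: train={len(train_idx)}, test={len(test_idx)}")
]]></code>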
</sec>
<sec id="sec15">
<label>3.6</label>
<title>Pre-processing</title>
<p>The dataset featured leaves from four distinct plant groups, and each image was first checked by hand to discard blurry or broken ones. After sorting, every photo was scaled, without exception, to 224 &#x00D7; 224 pixels; this standard frame lets the system process the visuals uniformly. Pixel values were then rescaled into the range zero to one, smoothing the path for stable learning. Note that the four plant groups serve only as image sources: rather than sorting by plant kind, the task groups leaves by disease type, and each category blends specimens from multiple species tied together by symptom patterns rather than genus. This setup pushes the model to learn signs of illness that generalize across hosts.</p>
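<p>Loading the disease-labeled folders with the 224 &#x00D7; 224 resizing and [0, 1] scaling described above could be sketched as below, assuming torchvision; the directory path and layout are illustrative.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Load images grouped by disease class (illustrative sketch; the path is assumed).
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),   # uniform frame for the transformer input
    transforms.ToTensor(),           # rescales pixel values into [0, 1]
])

# One sub-folder per class: bacterial_blight/, curl_virus/, fusarium_wilt/, healthy/
dataset = datasets.ImageFolder("data/cotton_leaves", transform=preprocess)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
print(dataset.classes)
]]></code>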
</sec>
<sec id="sec16">
<label>3.7</label>
<title>Classifiers</title>
<p>Several well-known vision transformers (ViT, Swin, DeiT, and T2T-ViT) take part in this research; the novelty lies in their head-to-head evaluation across diverse plant species and diseases. Rather than scattered methods, one consistent pipeline handles both preparation and analysis, making the comparison fair. Such an approach, still rare in agriculture-focused machine learning, yields a reproducible framework that others can build on. What emerges is not just a ranking, but a clearer view of which architectural families suit diseased-leaf classification best.</p>
</sec>
<sec id="sec17">
<label>3.8</label>
<title>Vanilla vision transformer model</title>
<p><xref ref-type="fig" rid="fig4">Figure 4</xref> shows the architecture of a Vision Transformer (ViT), a deep learning model designed to process images by applying transformer architecture. Let us break down each part of the <xref ref-type="fig" rid="fig4">Figure 4</xref> diagram:<list list-type="order">
<list-item>
<p>Input Image and Patch Splitting: The input image is divided into smaller, fixed-size patches, each flattened into a vector &#x0026; treated as a separate &#x201C;token,&#x201D; similar to words in NLP models.</p>
</list-item>
<list-item>
<p>Linear Projection of Flattened Patches: Each flattened patch is projected into an embedding space, producing a sequence of embeddings, which serves as input to the transformer blocks (similar to word embeddings in NLP).</p>
</list-item>
<list-item>
<p>Transformer Blocks (repeated N times): The ViT architecture consists of multiple transformer blocks, each with the following components:</p>
</list-item>
</list><list list-type="bullet">
<list-item>
<p>Multi-Head Self Attention (MHA): Multiple attention heads calculate attention scores, capturing interactions among patches.</p>
</list-item>
<list-item>
<p>Normalization (NORM): Normalization layers are applied before and after attention layers to stabilize training.</p>
</list-item>
<list-item>
<p>MLP (Multi-Layer Perceptron): A two-layer feedforward network with a nonlinear activation processes each token representation.</p>
</list-item>
<list-item>
<p>Residual Connections (+): Skip connections between layers improve gradient flow and mitigate the vanishing-gradient problem.</p>
</list-item>
</list></p>
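<p>The first two steps, patch splitting and linear projection, reduce to a few tensor operations. A minimal sketch assuming PyTorch, with sizes following the common ViT-Base configuration (16 &#x00D7; 16 patches, 768-dimensional embeddings):</p>
<code language="python" xml:space="preserve"><![CDATA[
# Patch splitting and linear projection, the first two ViT steps (illustrative sketch).
import torch
import torch.nn as nn

image = torch.randn(1, 3, 224, 224)                   # one input image
patch, dim = 16, 768                                  # patch size and embedding width

# A strided convolution implements "split into patches + linear projection" in one go.
to_tokens = nn.Conv2d(3, dim, kernel_size=patch, stride=patch)
tokens = to_tokens(image).flatten(2).transpose(1, 2)  # (1, 196, 768) token sequence

cls = nn.Parameter(torch.zeros(1, 1, dim))            # learnable classification token
tokens = torch.cat([cls, tokens], dim=1)              # (1, 197, 768), CLS prepended
print(tokens.shape)
]]></code>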
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Block schematic of the Swin transformer model.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Two diagram panels compare neural network model architectures for image processing. The upper panel shows a four-stage Swin Transformer pipeline with Patch Partition, Linear Embedding, Swin Transformer Blocks, Patch Merging, and a Multi-Layer Perceptron composed of fully connected, activation, and dropout layers. The lower panel adds Residual Blocks before each Swin Transformer Block in every stage, depicted with additional connections, and follows a similar pipeline structure, ending with an MLP.</alt-text>
</graphic>
</fig>
<sec id="sec18">
<label>3.8.1</label>
<title>Multi-head self attention (MHA) block</title>
<p>The MHA block includes multiple self-attention &#x201C;heads,&#x201D; which help the model capture different aspects of relationships between patches.<list list-type="bullet">
<list-item>
<p>Q, K, V (Query, Key, Value): These are linear transformations of the input patch embeddings, which are used to calculate attention scores between different patches.</p>
</list-item>
<list-item>
<p>Concatenate and Linear: The output from each attention head is concatenated and passed through a linear layer to combine information from all heads.</p>
</list-item>
</list></p>
</sec>
<sec id="sec19">
<label>3.8.2</label>
<title>Self-attention block</title>
<p>This block shows how self-attention is calculated within each head (a single-head sketch follows the list).<list list-type="bullet">
<list-item>
<p>MatMul and SoftMax: The Query and Key vectors are multiplied (MatMul) to calculate attention scores, and SoftMax is applied to normalize these scores.</p>
</list-item>
<list-item>
<p>Output Calculation: The attention scores are then used to weigh the Value vectors, generating the final output for each attention head.</p>
</list-item>
</list></p>
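<p>The computation just described reduces to two matrix multiplications and a SoftMax. A minimal single-head sketch, assuming PyTorch; the head dimension is illustrative:</p>
<code language="python" xml:space="preserve"><![CDATA[
# Scaled dot-product self-attention for a single head (illustrative sketch).
import torch
import torch.nn as nn

tokens = torch.randn(1, 197, 64)                 # (batch, tokens, head dimension)
Wq, Wk, Wv = nn.Linear(64, 64), nn.Linear(64, 64), nn.Linear(64, 64)

Q, K, V = Wq(tokens), Wk(tokens), Wv(tokens)     # Query, Key, Value projections
scores = Q @ K.transpose(-2, -1) / 64 ** 0.5     # MatMul of Q and K, then scaling
weights = scores.softmax(dim=-1)                 # SoftMax normalizes the scores
output = weights @ V                             # attention-weighted sum of Values
print(output.shape)                              # (1, 197, 64)
]]></code>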
<p><xref ref-type="fig" rid="fig2">Figure 2</xref> shows the performance of the Viet modifier for the dataset considered.</p>
</sec>
<sec id="sec20">
<label>3.8.3</label>
<title>Swin transformer</title>
<p><xref ref-type="fig" rid="fig4">Figure 4</xref> illustrates the architecture of a Swin Transformer, a type of Vision Transformer specifically designed for handling visual tasks in a hierarchical and efficient manner. The diagram is divided into two main parts, showing two configurations of the Swin Transformer. Let us go through each section:</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Classification accuracy considering Swin transformer models.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart displaying the classification accuracy of the Swin transformer model under varying hyperparameters. Highest accuracy, near one hundred percent, is achieved with optimizer selection, number of epochs above ten, and higher training-testing ratios. Lower accuracy appears with fewer epochs, larger batch size, and specific learning rates.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec21">
<label>3.8.4</label>
<title>Overview of Swin transformer stages</title>
<p>
<list list-type="order">
<list-item>
<p>Patch Partitioning: The input image is divided into patches, where each patch represents a small region of the image.</p>
</list-item>
<list-item>
<p>Linear Embedding: Each patch is flattened and linearly projected to form a lower-dimensional embedding, similar to the process in Vision Transformers.</p>
</list-item>
<list-item>
<p>Stage Processing with Swin Transformer Blocks: The architecture is organized into four stages, each with a different number of Swin Transformer blocks:</p>
<list list-type="bullet">
<list-item>
<p>Swin Transformer Block: Each block processes the input embedding using window-based self-attention, where attention is computed within local windows instead of the entire image.</p>
</list-item>
<list-item>
<p>Patch Merging: After each stage, patch merging reduces the number of patches by combining neighboring patches, increasing the receptive field as the network deepens. The number of Swin Transformer blocks increases in each stage, allowing for more complex representations.</p>
</list-item>
</list>
</list-item>
<list-item>
<p>MLP Layer: After the fourth stage, the extracted features are passed through an MLP head, consisting of fully connected layers, activation functions, and dropout, for classification tasks (a window-partitioning sketch follows this list).</p>
</list-item>
</list>
</p>
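<p>The window-based attention that distinguishes the Swin design rests on partitioning the feature map into local windows. A minimal sketch of that partitioning, assuming PyTorch; the tensor sizes follow the common Swin-T configuration and are illustrative:</p>
<code language="python" xml:space="preserve"><![CDATA[
# Partition a feature map into non-overlapping attention windows (sketch).
import torch

def window_partition(x, win):
    # x: (batch, height, width, channels) -> (num_windows*batch, win*win, channels)
    b, h, w, c = x.shape
    x = x.view(b, h // win, win, w // win, win, c)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, win * win, c)

feat = torch.randn(1, 56, 56, 96)          # stage-1 feature map, Swin-T-like sizes
windows = window_partition(feat, win=7)    # attention runs inside each 7x7 window
print(windows.shape)                       # (64, 49, 96)
]]></code>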
<p><xref ref-type="fig" rid="fig3">Figure 3</xref> shows the performance of the swin converter for the dataset considered.</p>
</sec>
<sec id="sec22">
<label>3.8.5</label>
<title>DeiT (data-efficient image transformer)</title>
<p>The DeiT is a variation of the ViT designed to improve the model&#x2019;s efficiency and accuracy, especially when training on smaller datasets. Introduced in 2021 by Facebook AI, DeiT brought in methods that ease the reliance on the huge datasets typical of standard transformers, so that large-scale pre-training is no longer a strict requirement. Instead of leaning on vast data, it leans on smarter training techniques, which opens the door to wider adoption where computing power or data access is limited.</p>
</sec>
<sec id="sec23">
<label>3.8.6</label>
<title>Distillation token</title>
<p>
<list list-type="bullet">
<list-item>
<p>The DeiT setup adds a distillation token along with the usual CLS marker&#x2014;two tokens moving through layers together, each playing distinct roles during training. One guides label-based learning, while the other absorbs knowledge from a teacher network, shaping how features evolve across blocks.</p>
</list-item>
<list-item>
<p>A teacher model, often a CNN or large pretrained transformer, guides the student during training through the distillation token, shaping its progress more effectively while streamlining knowledge transfer.</p>
</list-item>
<list-item>
<p>Through smooth knowledge sharing, the teacher model offers fuzzy guesses for each category&#x2014;helping DeiT adapt better when data is limited.</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>1. Training on Smaller Datasets:</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>DeiT works better when data is limited, whereas the original ViT relies on massive datasets (JFT-300&#x202F;M or ImageNet-21&#x202F;k) to learn effectively beforehand.</p>
</list-item>
<list-item>
<p>A single million-image dataset like ImageNet-1&#x202F;k can still push DeiT forward&#x2014;especially once the distillation token steps in, trimming reliance on massive data loads while holding its own in results.</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>2. Multi-Head Self-Attention and Feedforward Layers:</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>Similar to ViT, DeiT maintains the basic transformer structure with multi-head self-attention and feedforward layers.</p>
</list-item>
<list-item>
<p>Its typical architecture uses positional encodings, fixed-size picture patches, and a number of transformer encoder layers.</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>3. Efficient Training Techniques:</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>To lessen overfitting, DeiT uses a number of training improvements, including data augmentation, regularization, and stochastic depth, which randomly removes pathways from the model during training.</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>4. Loss Function:</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>DeiT combines the standard cross-entropy loss with a distillation loss. The cross-entropy loss is computed for the classification token, while the distillation loss is computed for the distillation token, based on the teacher&#x2019;s predictions (see the sketch after this list).</p>
</list-item>
<list-item>
<p>This dual-loss approach encourages the model to balance both the actual label predictions and the soft label guidance from the teacher.</p>
</list-item>
</list>
</p>
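<p>A hedged sketch of this dual loss, assuming PyTorch and the &#x201C;hard&#x201D; distillation variant described in the DeiT paper; all tensors below are placeholders:</p>
<code language="python" xml:space="preserve"><![CDATA[
# DeiT-style dual loss: cross-entropy on the class token plus a
# distillation term on the distillation token (illustrative sketch).
import torch
import torch.nn.functional as F

cls_logits = torch.randn(8, 4)       # student output from the CLS token
dist_logits = torch.randn(8, 4)      # student output from the distillation token
teacher_logits = torch.randn(8, 4)   # predictions from a pretrained teacher
labels = torch.randint(0, 4, (8,))   # ground-truth disease classes

ce = F.cross_entropy(cls_logits, labels)
# "Hard" distillation: the teacher's argmax acts as a pseudo-label.
distill = F.cross_entropy(dist_logits, teacher_logits.argmax(dim=1))
loss = 0.5 * ce + 0.5 * distill      # equal weighting, as in the DeiT paper
print(loss.item())
]]></code>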
<p><xref ref-type="fig" rid="fig6">Figure 6</xref> shows the performance of the swin modifier for the dataset considered.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Classification accuracy considering DeiT transformer models.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart titled &#x201C;Performance of DieT transformer model&#x201D; compares classification accuracy for different hyper-parameters. Most settings yield near 100 percent except learning rate 1.00E-04 and epochs 20, which are lower.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec24">
<label>3.8.7</label>
<title>Token to token transformer</title>
<p><xref ref-type="fig" rid="fig7">Figure 7</xref> depicts the architecture of the T2T-ViT (Tokens-to-Tokens Vision Transformer), which is designed to enhance the Vision Transformer (ViT) by capturing better local structure information within the input images.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Block schematic of T2T transformer model.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Block diagram illustrating the T2T-ViT model architecture, showing an input image being unfolded, processed sequentially through tokens-to-token transformers, fixed tokeners, T2T transformer blocks, and an MLP head for classification output.</alt-text>
</graphic>
</fig>
<p>Here&#x2019;s a breakdown of the architecture as shown in the diagram:<list list-type="simple">
<list-item>
<p>1. Input Image (224 &#x00D7; 224 pixels):</p>
</list-item>
</list><list list-type="bullet">
<list-item>
<p>The T2T-ViT model begins with an input image of size 224 &#x00D7; 224 pixels.</p>
</list-item>
<list-item>
<p>To capture additional spatial information in each location, this image is then segmented into overlapping patches rather than normal non-overlapping patches.</p>
</list-item>
</list><list list-type="simple">
<list-item>
<p>2. Tokens-to-Tokens (T2T) Module:</p>
</list-item>
</list><list list-type="bullet">
<list-item>
<p>The input image is converted by the T2T module into tokens, which are the basic units supplied to the transformer (the soft-split operation behind this step is sketched after the list).</p>
</list-item>
<list-item>
<p>It operates in multiple stages to progressively merge tokens, thereby creating a hierarchy of tokens that better represents the image&#x2019;s local information.</p>
</list-item>
</list><list list-type="simple">
<list-item>
<p>3. Transformer Backbone:</p>
</list-item>
</list><list list-type="bullet">
<list-item>
<p>After processing in the T2T module, the tokens are passed to the main Transformer backbone.</p>
</list-item>
<list-item>
<p>This backbone consists of various transformer layers, each with multi-head self-attention mechanisms and feed-forward networks.</p>
</list-item>
<list-item>
<p>Each layer processes the tokens, allowing the model to capture global context and relationships across different image regions.</p>
</list-item>
<list-item>
<p>Positional Encoding (PE): To maintain spatial information, positional encodings are added to the tokens since transformers alone do not inherently understand spatial positioning.</p>
</list-item>
</list><list list-type="simple">
<list-item>
<p>4. MLP Head:</p>
</list-item>
</list><list list-type="bullet">
<list-item>
<p>After passing through the transformer layers, the tokens are directed to the MLP head for classification.</p>
</list-item>
<list-item>
<p>The output from the MLP head classifies the image into one of the target categories in a classification task.</p>
</list-item>
</list></p>
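<p>The overlapping &#x201C;soft split&#x201D; at the heart of the T2T module can be expressed with an unfold operation. A minimal sketch assuming PyTorch; the kernel, stride, and padding values are illustrative choices in the spirit of the T2T-ViT design:</p>
<code language="python" xml:space="preserve"><![CDATA[
# The T2T "soft split": overlapping patches via unfold (illustrative sketch).
import torch
import torch.nn as nn

x = torch.randn(1, 3, 224, 224)                  # input image
soft_split = nn.Unfold(kernel_size=7, stride=4, padding=2)

tokens = soft_split(x).transpose(1, 2)           # (1, 3136, 147) overlapping tokens
# A full T2T step would now apply a small transformer to these tokens, reshape
# them back to a 2D grid, and unfold again, progressively shrinking the count.
print(tokens.shape)
]]></code>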
<p><xref ref-type="fig" rid="fig8">Figures 8</xref>, <xref ref-type="fig" rid="fig9">9</xref> show the performance of the T2T transformer for the dataset considered.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>Classification accuracy considering T2T transformer models.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g008.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart showing the performance of a token transformer model using various hyper-parameters, with classification accuracy ranging from 99 to 100 percent. Best performance is achieved with SGD and RMSprop optimizers, as well as with a batch size of eight and a training-testing ratio of eighty to twenty. Hyper-parameters compared include learning rate, batch size, optimizer, number of epochs, and training-testing ratio.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Per-class metrics of the T2T transformer.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g009.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart titled "Per-class metrics - t2t" displays precision, recall, and F1-score for classes bacterial_blight, curl_virus, fusarium_wilt, and healthy. Precision is highest for fusarium_wilt, recall peaks for healthy, and all metrics are lowest for curl_virus.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="results" id="sec25">
<label>4</label>
<title>Results and discussion</title>
<p>Instead of depending only on a single train-test split, the model was further validated to guarantee a more trustworthy and objective performance evaluation. Because the model is tested across several data partitions, this method offers a more thorough assessment of stability and generalization. The final reported results represent the average performance over all folds.</p>
<p>Such automated disease detection systems could enable large-scale crop health monitoring and contribute to broader agricultural sustainability efforts, even though the main focus of this study is the classification of cotton leaf diseases using transformer-based models. Ecological monitoring and biodiversity conservation, however, are listed only as potential uses for further research; they are not directly evaluated here.</p>
<p>Preprocessing steps, including image augmentation and normalization, ensure consistency and quality for model input. Transformer models such as the Vanilla Vision Transformer (ViT), Swin Transformer, DeiT, and T2T-ViT are used, and hyper-parameter tuning determines which model performs best. To demonstrate the superiority of the suggested method, performance is assessed using common classification criteria, and findings are contrasted with those of current research. To examine model behavior and effectiveness in cotton leaf disease detection, extensive trials are carried out, backed by confusion matrices and comprehensive classification reports.</p>
<p>The hyperparameters that led to the best classification performance on this dataset appear in <xref ref-type="table" rid="tab3">Table 3</xref>. Although the token-to-token transformer was covered in the methodology, it is omitted from <xref ref-type="table" rid="tab3">Table 3</xref> because its accuracy fell short of the full mark.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Classification report.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Name</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">F1-Score</th>
<th align="center" valign="top">Support</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Curl Virus</td>
<td align="char" valign="top" char=".">0.84</td>
<td align="char" valign="top" char=".">0.81</td>
<td align="char" valign="top" char=".">0.83</td>
<td align="center" valign="top">315</td>
</tr>
<tr>
<td align="left" valign="top">Bacterial Blight</td>
<td align="char" valign="top" char=".">0.85</td>
<td align="char" valign="top" char=".">0.83</td>
<td align="char" valign="top" char=".">0.84</td>
<td align="center" valign="top">335</td>
</tr>
<tr>
<td align="left" valign="top">Fusarium wilt</td>
<td align="char" valign="top" char=".">0.92</td>
<td align="char" valign="top" char=".">0.93</td>
<td align="char" valign="top" char=".">0.93</td>
<td align="center" valign="top">325</td>
</tr>
<tr>
<td align="left" valign="top">Healthy</td>
<td align="char" valign="top" char=".">0.82</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="char" valign="top" char=".">0.84</td>
<td align="center" valign="top">320</td>
</tr>
<tr>
<td align="left" valign="top">Accuracy</td>
<td/>
<td/>
<td align="char" valign="top" char=".">0.86</td>
<td align="center" valign="top">1,295</td>
</tr>
<tr>
<td align="left" valign="top">Macro avg</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="center" valign="top">1,295</td>
</tr>
<tr>
<td align="left" valign="top">Weigted avg</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="char" valign="top" char=".">0.86</td>
<td align="center" valign="top">1,295</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Hyper parameters selected for optimal performance.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Name</th>
<th align="center" valign="top">ViT</th>
<th align="center" valign="top">Swin</th>
<th align="center" valign="top">DeiT</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="bottom">Learning rate</td>
<td align="center" valign="middle">1.00E-05</td>
<td align="center" valign="middle">1.00E-05</td>
<td align="center" valign="middle">1.00E-05</td>
</tr>
<tr>
<td align="left" valign="bottom">Batch size</td>
<td align="center" valign="bottom">16</td>
<td align="center" valign="bottom">8</td>
<td align="center" valign="bottom">16</td>
</tr>
<tr>
<td align="left" valign="bottom">Optimizer</td>
<td align="center" valign="bottom">SGD</td>
<td align="center" valign="bottom">SGD</td>
<td align="center" valign="bottom">SGD</td>
</tr>
<tr>
<td align="left" valign="bottom">No of epochs</td>
<td align="center" valign="bottom">20</td>
<td align="center" valign="bottom">20</td>
<td align="center" valign="bottom">5</td>
</tr>
<tr>
<td align="left" valign="bottom">Training testing ratio</td>
<td align="center" valign="bottom">70:30</td>
<td align="center" valign="bottom">71:30</td>
<td align="center" valign="bottom">72:30</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The confusion matrix breaks down the model&#x2019;s performance, sorting outcomes into true positives, true negatives, false positives, and false negatives; these categories spotlight where errors tend to cluster. The ROC curve shows how sensitivity trades off against the false-positive rate as the decision threshold changes, and a greater area under the curve (AUC) points toward clearer separation between classes. As shown in <xref ref-type="table" rid="tab4">Table 4</xref>, <xref ref-type="fig" rid="fig10">Figure 10</xref> together with <xref ref-type="fig" rid="fig11">Figure 11</xref> displays confusion matrices alongside ROC plots for the transformer setups paired with specific classifiers, revealing uneven results across classification tasks.</p>
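<p>Per-class ROC curves and AUC values of the kind plotted in <xref ref-type="fig" rid="fig11">Figure 11</xref> could be computed as in the sketch below, assuming scikit-learn; the labels and probabilities are random placeholders.</p>
<code language="python" xml:space="preserve"><![CDATA[
# Per-class ROC curves and AUC from predicted probabilities (sketch).
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

y_true = np.random.randint(0, 4, 1295)           # stand-in ground-truth labels
y_prob = np.random.dirichlet(np.ones(4), 1295)   # stand-in predicted probabilities

y_bin = label_binarize(y_true, classes=[0, 1, 2, 3])  # one-vs-rest encoding
for c in range(4):
    fpr, tpr, _ = roc_curve(y_bin[:, c], y_prob[:, c])
    print(f"class {c}: AUC = {auc(fpr, tpr):.3f}")
]]></code>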
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Classification report of DeiT.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Name</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">F1-Score</th>
<th align="center" valign="top">Support</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Curl Virus</td>
<td align="char" valign="top" char=".">0.99</td>
<td align="char" valign="top" char=".">0.98</td>
<td align="char" valign="top" char=".">0.98</td>
<td align="center" valign="top">335</td>
</tr>
<tr>
<td align="left" valign="top">Bacterial Blight</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="center" valign="top">315</td>
</tr>
<tr>
<td align="left" valign="top">Fusarium wilt</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="char" valign="top" char=".">1.0</td>
<td align="char" valign="top" char=".">0.98</td>
<td align="center" valign="top">325</td>
</tr>
<tr>
<td align="left" valign="top">Healthy</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="center" valign="top">320</td>
</tr>
<tr>
<td align="left" valign="top">Accuracy</td>
<td/>
<td/>
<td align="char" valign="top" char=".">0.97</td>
<td align="center" valign="top">1,295</td>
</tr>
<tr>
<td align="left" valign="top">Macro avg</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="center" valign="top">1,295</td>
</tr>
<tr>
<td align="left" valign="top">Weigted avg</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="center" valign="top">1,295</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig10">
<label>Figure 10</label>
<caption>
<p>Confusion matrix corresponding to the DeiT classification report.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g010.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Confusion matrix for disease classification using the deit model, with four classes: bacterial blight, curl virus, fusarium wilt, and healthy. Most predictions align with the true classes, indicating high accuracy.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig11">
<label>Figure 11</label>
<caption>
<p>Confusion matrix and ROC curve of the transformer models.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g011.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Confusion matrix on the left shows perfect classification among four classes: bacterial blight, curl virus, fusarium wilt, and healthy, with no misclassifications. ROC curve on the right displays four colored lines representing the classes, each achieving an area under the curve of one point zero zero, indicating perfect classifier performance.</alt-text>
</graphic>
</fig>
<p>Although accuracy offers a general indicator of correct predictions, it may not adequately represent how the model behaves on individual classes. We therefore assessed the model using precision, recall, F1-score, the confusion matrix, and ROC-AUC in addition to accuracy. Precision and recall evaluate class-wise performance, the F1-score balances the two, and the confusion matrix gives detailed insight into misclassification patterns. Together these criteria provide a more thorough and dependable evaluation of the model&#x2019;s performance, summarized in <xref ref-type="fig" rid="fig12">Figures 12</xref>, <xref ref-type="fig" rid="fig13">13</xref>.</p>
<fig position="float" id="fig12">
<label>Figure 12</label>
<caption>
<p>Per-class metrics of DeiT.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g012.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Grouped bar chart displaying per-class metrics for a model labeled "deit," with precision, recall, and F1-score values for bacterial blight, curl virus, fusarium wilt, and healthy categories, all near or at 1.0.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig13">
<label>Figure 13</label>
<caption>
<p>Confusion matrix of DeiT.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g013.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Confusion matrix heatmap titled &#x201C;Confusion Matrix - deit&#x201D; compares actual versus predicted labels for four categories: bacterial blight, curl virus, fussarium wilt, and healthy. Each diagonal cell shows high values, indicating correct classifications, with color intensity ranging from light blue (lower values) to dark blue (higher values). A color scale on the right ranges from zero to over three hundred.</alt-text>
</graphic>
</fig>
<p>There are several reasons why the transformer-based models perform better. First, the self-attention mechanisms used by architectures such as ViT, Swin Transformer, DeiT, and T2T-ViT enable the model to extract global contextual information and long-range dependencies from leaf images. Because disease indicators, including discoloration, texture changes, and uneven patterns, are frequently dispersed across several leaf sections, this ability is especially helpful for disease diagnosis (<xref ref-type="fig" rid="fig14">Figure 14</xref>).</p>
<fig position="float" id="fig14">
<label>Figure 14</label>
<caption>
<p>Confusion matrix of ViT.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g014.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Confusion matrix heatmap for a classification model labeled vit, showing actual versus predicted classes for bacterial blight, curl virus, fusarium wilt, and healthy samples, with correct classifications along the diagonal.</alt-text>
</graphic>
</fig>
<p>Second, in contrast to typical CNNs that rely on fixed-size convolutional kernels, transformers dynamically learn associations between all image patches, allowing richer feature representations and better discrimination between disease classes. This yields greater robustness, particularly in datasets where different plant species may exhibit comparable diseases.</p>
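<p>A minimal sketch makes this global interaction concrete: with a 224 &#x00D7; 224 image split into 196 patch embeddings, a single multi-head attention layer lets every patch attend to every other patch, whereas a convolutional kernel sees only a fixed local window. The dimensions below are standard ViT-Base values, assumed here for illustration.</p>
<code language="python">
import torch
import torch.nn as nn

# 14x14 grid of 16x16 patches from a 224x224 image, each embedded in 768 dims.
patches = torch.randn(1, 196, 768)
attn = nn.MultiheadAttention(embed_dim=768, num_heads=12, batch_first=True)
out, weights = attn(patches, patches, patches)
print(weights.shape)  # (1, 196, 196): every patch attends to all 196 patches
</code>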
<p>Third, the remarkably high accuracy attained in our tests also reflects the enhanced pre-processing procedures and hyperparameter tuning, which further improve generalization. Additionally, patch embedding and multi-head attention give transformer models strong regularization properties that help minimize overfitting, even on a complex dataset comprising four leaf classes (<xref ref-type="fig" rid="fig15">Figures 15</xref>, <xref ref-type="fig" rid="fig16">16</xref>).</p>
<fig position="float" id="fig15">
<label>Figure 15</label>
<caption>
<p>Confusion matrix of Swin.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g015.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Confusion matrix for a swin model showing predicted versus actual classes: bacterial blight, curl virus, fusarium wilt, and healthy. Most classifications are on the diagonal, indicating high accuracy. Color intensity reflects value magnitude.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig16">
<label>Figure 16</label>
<caption>
<p>Per-class metrics of Swin.</p>
</caption>
<graphic xlink:href="frai-08-1743264-g016.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart comparing per-class metrics for a swin model across four categories: bacterial blight, curl virus, fusarium wilt, and healthy. Precision, recall, and F1-score are all near one point zero for each class.</alt-text>
</graphic>
</fig>
<sec id="sec26">
<label>4.1</label>
<title>Discussion of the proposed model</title>
<p>The dataset utilized in this study comprises 1,711 images categorized into four classes of cotton leaf condition: bacterial blight, curl virus, fusarium wilt, and healthy. Convolutional neural networks (CNNs) and other traditional machine learning and deep learning techniques have been widely investigated for automated plant disease identification from leaf imagery. Although CNN-based models have shown encouraging results, their dependence on limited receptive fields frequently restricts their capacity to capture the global contextual information and long-range dependencies present in intricate leaf disease patterns. Recent studies have turned to hybrid approaches and ensemble learning to overcome these restrictions; however, such methods often increase computational complexity and rely on manual feature engineering.</p>
<p>The present study, in contrast, employs transformer-based architectures (ViT, Swin Transformer, DeiT, and T2T-ViT) that model global interactions over the entire image through self-attention. This capability is particularly valuable when classifying cotton leaf conditions, since texture shifts, distorted veins, and abnormal coloration often appear scattered across the leaf surface. Whereas earlier efforts usually evaluated a single model configuration, here several transformers are compared side by side under identical conditions.</p>
<p>The validation protocol further distinguishes this work. Many recent studies rely on a single train/test split, which can give an unreliable estimate of performance, especially with limited samples. To address this, and to keep class proportions stable, the approach uses repeated stratified splits across five separate validation rounds, which strengthens the credibility and real-world relevance of the results. Rather than comparing against figures reported on unrelated datasets, which invites unreliable comparisons, the discussion focuses on how methodological choices shape outcomes: architectural advantages, rigor in evaluation, and practical field utility, not just raw scores. Taken together, the transformer framework shows strong potential for detecting cotton leaf diseases consistently, supporting timely interventions in farming systems, and the careful comparison of multiple transformer variants adds meaningful detail to ongoing work on visual models in smart crop monitoring.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec27">
<label>5</label>
<title>Conclusion</title>
<p>This research introduced a transformer-based system for automatically detecting cotton leaf diseases from field image data. In place of traditional methods, it relied on Vision Transformers, paired with a thorough pre-processing pipeline, to capture signs of disease in leaf images. For fair evaluation and balanced representation across classes, performance was assessed with five-fold stratified cross-validation (folds 1 through 5) alongside a separate held-out dataset. Results held steady across every fold, reaching a near-perfect accuracy of 99.99% in each round. This consistency indicates the model&#x2019;s robustness and its suitability for deployment under real farming conditions. Transformer-based architectures capture both local and long-range dependencies, which is key to distinguishing plant diseases accurately. Altogether, the study supports the use of advanced neural methods in smart farming, suggesting they can sharpen early warnings, guide decisions, and promote healthier crops with less waste. Future work could incorporate explainable AI tools, test lighter transformers on live data streams, and adapt the framework to broader, richer collections of field data, making outputs more interpretable and earning grower confidence.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec28">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="sec29">
<title>Author contributions</title>
<p>NI: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. MM: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. RK: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. JJ: Writing &#x2013; review &#x0026; editing, Validation. SP: Writing &#x2013; review &#x0026; editing, Validation. PK: Writing &#x2013; review &#x0026; editing, Visualization.</p>
</sec>
<sec sec-type="COI-statement" id="sec30">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec31">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec32">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Akshai</surname><given-names>K.</given-names></name> <name><surname>Anitha</surname><given-names>J.</given-names></name></person-group> (<year>2021</year>). &#x201C;Plant disease classification using deep learning.&#x201D; in <italic>2021 3rd international conference on signal processing and communication (ICPSC) (IEEE)</italic>. pp. 407&#x2013;411.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alharbi</surname><given-names>A.</given-names></name> <name><surname>Khan</surname><given-names>M. U. G.</given-names></name> <name><surname>Tayyaba</surname><given-names>B.</given-names></name></person-group> (<year>2023</year>). <article-title>Wheat disease classification using continual learning</article-title>. <source>IEEE Access</source> <volume>11</volume>:<fpage>4358</fpage>. doi: <pub-id pub-id-type="doi">10.1109/access.2023.3304358</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>P.</given-names></name> <name><surname>Xiao</surname><given-names>Q.</given-names></name> <name><surname>Zhang</surname><given-names>J.</given-names></name> <name><surname>Xie</surname><given-names>C.</given-names></name> <name><surname>Wang</surname><given-names>B.</given-names></name></person-group> (<year>2020</year>). <article-title>Occurrence prediction of cotton pests and diseases by bidirectional long short-term memory networks with climate and atmosphere circulation</article-title>. <source>Comput. Electron. Agric.</source> <volume>176</volume>:<fpage>105612</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compag.2020.105612</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dhaka</surname><given-names>V. S.</given-names></name> <name><surname>Meena</surname><given-names>S. V.</given-names></name> <name><surname>Rani</surname><given-names>G.</given-names></name> <name><surname>Sinwar</surname><given-names>D.</given-names></name> <name><surname>Ijaz</surname><given-names>M. F.</given-names></name> <name><surname>Wozniak</surname><given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>A survey of deep convolutional neural networks applied for prediction of plant leaf diseases</article-title>. <source>Sensors</source> <volume>21</volume>:<fpage>4749</fpage>.</mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dhakal</surname><given-names>K.</given-names></name> <name><surname>Sivaramakrishnan</surname><given-names>U.</given-names></name> <name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Belay</surname><given-names>K.</given-names></name> <name><surname>Oakes</surname><given-names>J.</given-names></name> <name><surname>Wei</surname><given-names>X.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Machine learning analysis of hyperspectral images of damaged wheat kernels</article-title>. <source>Sensors</source> <volume>23</volume>:<fpage>3523</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s23073523</pub-id>, <pub-id pub-id-type="pmid">37050581</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname><given-names>A.</given-names></name> <name><surname>Beyer</surname><given-names>L.</given-names></name> <name><surname>Kolesnikov</surname><given-names>A.</given-names></name> <name><surname>Weissenborn</surname><given-names>D.</given-names></name> <name><surname>Zhai</surname><given-names>X.</given-names></name> <name><surname>Unterthiner</surname><given-names>T.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gupta</surname><given-names>A.</given-names></name> <name><surname>Pathak</surname><given-names>H.</given-names></name></person-group> (<year>2016</year>). <source>Climate change and agriculture in India</source>. Proceedings of the World Anthropology Congress 2023 (WAC 2023) <publisher-loc>New Delhi</publisher-loc>.</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>He</surname><given-names>K.</given-names></name> <name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Ren</surname><given-names>S.</given-names></name> <name><surname>Sun</surname><given-names>J.</given-names></name></person-group> (<year>2016</year>). &#x201C;Deep residual learning for image recognition.&#x201D; in: <italic>Proceedings of the IEEE conference on computer vision and pattern recognition</italic>, pp. 770&#x2013;778.</mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Howard</surname><given-names>A.</given-names></name> <name><surname>Sandler</surname><given-names>M.</given-names></name> <name><surname>Chu</surname><given-names>G.</given-names></name> <name><surname>Chen</surname><given-names>L.C.</given-names></name> <name><surname>Chen</surname><given-names>B.</given-names></name> <name><surname>Tan</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2019</year>). &#x201C;Searching for mobilenetv3&#x201D;. in <italic>Proceedings of the IEEE/CVF international conference on computer vision.</italic> pp. 1314&#x2013;1324.</mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hughes</surname><given-names>D.</given-names></name> <name><surname>Salathe</surname><given-names>M.</given-names></name></person-group> (<year>2015</year>). <article-title>An open-access repository of images on plant health to enable the development of mobile disease diagnostics</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1511.08060</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Jenifa</surname><given-names>A.</given-names></name> <name><surname>Ramar</surname><given-names>Ramalakshmi</given-names></name> <name><surname>Veerachamy</surname><given-names>Ramachandran</given-names></name></person-group>. (<year>2019</year>). Cotton leaf disease classification using deep convolution neural network for sustainable cotton production. 1&#x2013;3.</mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kaur</surname><given-names>S.</given-names></name> <name><surname>Pandey</surname><given-names>S.</given-names></name> <name><surname>Goel</surname><given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Semi-automatic leaf disease detection and classi f ication system for soybean culture</article-title>. <source>IET Image Process.</source> <volume>12</volume>, <fpage>1038</fpage>&#x2013;<lpage>1048</lpage>.</mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Khairnar</surname><given-names>K.</given-names></name> <name><surname>Goje</surname><given-names>N.</given-names></name></person-group> (<year>2020</year>). <article-title>Proceedings of the World Anthropology Congress 2023 (WAC 2023)</article-title> (<publisher-loc>Bhubaneswar</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>55</fpage>&#x2013;<lpage>65</lpage>.</mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname><given-names>X.</given-names></name> <name><surname>Yang</surname><given-names>R.</given-names></name> <name><surname>Zhou</surname><given-names>J.</given-names></name> <name><surname>Jiao</surname><given-names>J.</given-names></name> <name><surname>Liu</surname><given-names>F.</given-names></name> <name><surname>Liu</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A hybrid model of ghost-convolution enlightened transformer for effective diagnosis of grape leaf disease and pest</article-title>. <source>J. King Saud Univ. Comput. Inf. Sci.</source> <volume>34</volume>, <fpage>1755</fpage>&#x2013;<lpage>1767</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jksuci.2022.03.006</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Managuli</surname><given-names>M.</given-names></name> <name><surname>Chandramohan</surname><given-names>K. P.</given-names></name> <name><surname>Marimuthu</surname><given-names>M.</given-names></name> <name><surname>Duraisamy</surname><given-names>K.</given-names></name> <name><surname>Rajendran</surname><given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Buffer association of network on chip (NoC) using simulated network</article-title>. <source>Ing&#x00E9;nierie des Syst&#x00E8;mes d&#x2019;Information</source> <volume>29</volume>, <fpage>1797</fpage>&#x2013;<lpage>1807</lpage>. doi: <pub-id pub-id-type="doi">10.18280/isi.290513</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nikhil</surname><given-names>I.</given-names></name> <name><surname>Managuli</surname><given-names>M.</given-names></name> <name><surname>Koti</surname><given-names>R.</given-names></name> <name><surname>Jakati</surname><given-names>J.</given-names></name> <name><surname>Sharanappa</surname><given-names>P. H.</given-names></name> <name><surname>Kulkarni</surname><given-names>P. D</given-names></name></person-group>. (<year>2024</year>). <article-title>Recoverable data hiding in encrypted images through extent reversing before inscription</article-title>. <source>J. Inst. Eng. India Ser. B</source>. doi: <pub-id pub-id-type="doi">10.1007/s40031-024-01145-5</pub-id></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Meyer</surname><given-names>L.</given-names></name> <name><surname>Dew</surname><given-names>T.</given-names></name> <name><surname>Grace</surname><given-names>M.</given-names></name> <name><surname>Lanclos</surname><given-names>K.</given-names></name> <name><surname>MacDonald</surname><given-names>S.</given-names></name> <name><surname>Soley</surname><given-names>G.</given-names></name></person-group> (<year>2023</year>). The world and United States cotton outlook. In U.S. department of agriculture. Agricultural outlook forum 2023 presented Friday, February 24</mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nadiruzzaman</surname><given-names>M.</given-names></name> <name><surname>Rahman</surname><given-names>M.</given-names></name> <name><surname>Pal</surname><given-names>U.</given-names></name> <name><surname>Croxton</surname><given-names>S.</given-names></name> <name><surname>Rashid</surname><given-names>M. B.</given-names></name> <name><surname>Bahadur</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Impact of climate change on cotton production in Bangladesh</article-title>. <source>Sustainability</source> <volume>13</volume>:<fpage>574</fpage>. doi: <pub-id pub-id-type="doi">10.3390/su13020574</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nigam</surname><given-names>S.</given-names></name> <name><surname>Jain</surname><given-names>R.</given-names></name> <name><surname>Marwaha</surname><given-names>S.</given-names></name> <name><surname>Arora</surname><given-names>A.</given-names></name> <name><surname>Haque</surname><given-names>M. A.</given-names></name> <name><surname>Dheeraj</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Deep transfer learning model for disease identification in wheat crop</article-title>. <source>Ecol. Informatics</source> <volume>75</volume>:<fpage>102068</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ecoinf.2023.102068</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Patil</surname><given-names>B. M.</given-names></name> <name><surname>Burkpalli</surname><given-names>V.</given-names></name></person-group> (<year>2021</year>). <article-title>A perspective view of cotton leaf image classification using machine learning algorithms using weka</article-title>. <source>Adv. Hum.-Comput. Interact.</source> <volume>2021</volume>:<fpage>19</fpage>.</mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Prakash</surname><given-names>D.</given-names></name> <name><surname>Managuli</surname><given-names>M.</given-names></name></person-group> (<year>2024a</year>). <article-title>Spectrally efficient DWDM system using DQPSK modulation</article-title>. <source>J. Opt. Commun.</source> <volume>46</volume>:<fpage>163</fpage>. doi: <pub-id pub-id-type="doi">10.1515/joc-2024-0163</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Prakash</surname><given-names>D.</given-names></name> <name><surname>Managuli</surname><given-names>M.</given-names></name></person-group> (<year>2024b</year>). <article-title>An optimization approach to DWDM network reconfiguration through reinforcement learning</article-title>. <source>SN Comput. Sci.</source> <volume>5</volume>:<fpage>1069</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s42979-024-03438-4</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Safari</surname><given-names>B.</given-names></name> <name><surname>Alborzi</surname><given-names>Y.</given-names></name> <name><surname>Najafi</surname><given-names>E.</given-names></name></person-group> (<year>2022</year>). <article-title>Automated wheat disease detection using a ROS-based autonomous guided UAV</article-title>. <source>arXiv</source>:<fpage>220615042</fpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2206.15042</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saleem</surname><given-names>R. M.</given-names></name> <name><surname>Kazmi</surname><given-names>R.</given-names></name> <name><surname>Bajwa</surname><given-names>I. S.</given-names></name> <name><surname>Ashraf</surname><given-names>A.</given-names></name> <name><surname>Ramzan</surname><given-names>S.</given-names></name> <name><surname>Anwar</surname><given-names>W.</given-names></name></person-group> (<year>2021</year>). <article-title>Iot based cotton whitefly prediction using deep learning</article-title>. <source>Sci. Program.</source> <volume>2021</volume>:<fpage>17</fpage>. doi: <pub-id pub-id-type="doi">10.1155/2021/8824601</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Simonyan</surname><given-names>K.</given-names></name> <name><surname>Zisserman</surname><given-names>A.</given-names></name></person-group> (<year>2014</year>). <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <source>arXiv</source>:<fpage>14091556</fpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1409.1556</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Szegedy</surname><given-names>C</given-names></name> <name><surname>Vanhoucke</surname><given-names>V</given-names></name> <name><surname>Ioffe</surname><given-names>S</given-names></name> <name><surname>Shlens</surname><given-names>J</given-names></name> <name><surname>Wojna</surname><given-names>Z</given-names></name></person-group> (<year>2016</year>). &#x201C;Rethinking the inception architecture for computer vision.&#x201D; in: <italic>Proceedings of the IEEE conference on computer vision and pattern recognition</italic>. pp. 2818&#x2013;2826</mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Talukder</surname><given-names>M. A.</given-names></name> <name><surname>Islam</surname><given-names>M. M.</given-names></name> <name><surname>Uddin</surname><given-names>M. A.</given-names></name> <name><surname>Akhter</surname><given-names>A.</given-names></name> <name><surname>Hasan</surname><given-names>K. F.</given-names></name> <name><surname>Moni</surname><given-names>M. A.</given-names></name></person-group> (<year>2022</year>). <article-title>Machine learning-based lung and colon cancer detection using deep feature extraction and ensemble learning</article-title>. <source>Expert Syst. Appl.</source> <volume>205</volume>:<fpage>117695</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2022.117695</pub-id></mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Talukder</surname><given-names>M. A.</given-names></name> <name><surname>Islam</surname><given-names>M. M.</given-names></name> <name><surname>Uddin</surname><given-names>M. A.</given-names></name> <name><surname>Akhter</surname><given-names>A.</given-names></name> <name><surname>Pramanik</surname><given-names>M. A. J.</given-names></name> <name><surname>Aryal</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>An efficient deep learning model to categorize brain tumor using reconstruction and fine-tuning</article-title>. <source>Expert Syst. Appl.</source> <volume>230</volume>:<fpage>120534</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2023.120534</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Tan</surname><given-names>M.</given-names></name> <name><surname>Le</surname><given-names>Q.</given-names></name></person-group> (<year>2019</year>). &#x201C;Efficientnet: rethinking model scaling for convolutional neural networks.&#x201D; in <italic>International conference on machine learning. PMLR</italic>, pp. 6105&#x2013;6114.</mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Thakur</surname><given-names>P. S.</given-names></name> <name><surname>Khanna</surname><given-names>P.</given-names></name> <name><surname>Sheorey</surname><given-names>T.</given-names></name> <name><surname>Ojha</surname><given-names>A.</given-names></name></person-group> (<year>2021</year>). &#x201C;<article-title>Vision transformer for plant disease detection: plant ViT</article-title>&#x201D; in <source>International conference on computer vision and image processing</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>501</fpage>&#x2013;<lpage>511</lpage>.</mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Uddin</surname><given-names>M. A.</given-names></name> <name><surname>Islam</surname><given-names>M. M.</given-names></name> <name><surname>Talukder</surname><given-names>M. A.</given-names></name> <name><surname>Hossain</surname><given-names>M. A. A.</given-names></name> <name><surname>Akhter</surname><given-names>A.</given-names></name> <name><surname>Aryal</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Machine learning based diabetes detection model for false negative reduction</article-title>. <source>Biomed. Mater. Devices</source> <volume>2</volume>, &#x2013;<lpage>443</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s44174-023-00104-w</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Vaswani</surname><given-names>A.</given-names></name> <name><surname>Shazeer</surname><given-names>N.</given-names></name> <name><surname>Parmar</surname><given-names>N.</given-names></name> <name><surname>Uszkoreit</surname><given-names>J.</given-names></name> <name><surname>Jones</surname><given-names>L.</given-names></name> <name><surname>Gomez</surname><given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). &#x201C;<article-title>Attention is all you need</article-title>&#x201D; in <source>Proceedings of the 31st international conference on neural information processing systems (NIPS&#x2019;17)</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>), <fpage>6000</fpage>&#x2013;<lpage>6010</lpage>.</mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname><given-names>H.</given-names></name> <name><surname>Cheng</surname><given-names>X.</given-names></name> <name><surname>Chen</surname><given-names>C.</given-names></name> <name><surname>Heidari</surname><given-names>A. A.</given-names></name> <name><surname>Liu</surname><given-names>J.</given-names></name> <name><surname>Cai</surname><given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Apple leaf disease recognition method with improved residual network</article-title>. <source>Multimed. Tools Appl.</source> <volume>81</volume>:<fpage>81</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s11042-022-11915-2</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname><given-names>S.</given-names></name> <name><surname>Xie</surname><given-names>L.</given-names></name> <name><surname>Huang</surname><given-names>Q.</given-names></name></person-group> (<year>2023</year>). <article-title>Inception convolutional vision transformers for plant disease identification</article-title>. <source>Internet Things</source> <volume>21</volume>:<fpage>100650</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.iot.2022.100650</pub-id></mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname><given-names>D.</given-names></name> <name><surname>Tan</surname><given-names>J.</given-names></name> <name><surname>Wu</surname><given-names>C.</given-names></name> <name><surname>Yung</surname><given-names>K.</given-names></name> <name><surname>Ip</surname><given-names>A. W.</given-names></name></person-group> (<year>2023</year>). <article-title>Crop disease identification by fusing multiscale convolution and vision transformer</article-title>. <source>Sensors</source> <volume>23</volume>:<fpage>6015</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s23136015</pub-id>, <pub-id pub-id-type="pmid">37447864</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1568126/overview">Xing Yang</ext-link>, Anhui Science and Technology University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3169376/overview">Malithi De Silva</ext-link>, University of Kelaniya, Sri Lanka</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3293986/overview">Madhu Bala</ext-link>, Lovely Professional University, India</p>
</fn>
</fn-group>
</back>
</article>