<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1644271</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Cross-scale detection and cross-crop generalization verification of tomato diseases in complex agricultural environments</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Jinghuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2805580/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Jinying</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Heyang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Information and Technology, Jilin Agricultural University</institution>, <addr-line>Changchun</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>College of Horticulture, Jilin Agricultural University</institution>, <addr-line>Changchun</addr-line>,&#xa0;<country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1700583/overview">Chunlei Xia</ext-link>, Chinese Academy of Sciences (CAS), China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2691701/overview">Anjan Debnath</ext-link>, Khulna University of Engineering &amp; Technology, Bangladesh</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2812266/overview">Sreedevi Alampally</ext-link>, K L University, India</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Jinying Li, <email xlink:href="mailto:lijinying@jlau.edu.cn">lijinying@jlau.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>27</day>
<month>10</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1644271</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Hu, Li and Wang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Hu, Li and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>In order to overcome the key challenges associated with detecting tomato leaf disease in complex agricultural environments, such as leaf occlusion, variation in lesion size and light interference, this study presents a lightweight detection model called ToMASD. This model integrates multi-scale feature decoupling and an adaptive alignment mechanism. The model innovatively comprises a dual-branch adaptive alignment module (TAAM) that achieves cross-scale lesion semantic alignment via a dynamic feature pyramid, a local context-aware gated unit (Faster-GLUDet) that uses a spatial attention mechanism to suppress background noise interference, and a multi-scale decoupling detection head (MDH) that balances the detection accuracy of small and diffuse lesions. On a dataset containing six types of disease under various weather conditions, ToMASD achieves an average precision of 84.3%, outperforming thirteen mainstream models by a margin of 4.7% to 12.1%. The computational load is compressed to 7.1 GFLOPs. Through the introduction of a transfer learning paradigm, the pre-trained weights of the tomato disease detection model can be transferred to common bean and potato detection tasks. Through domain adaptation layers and adversarial feature decoupling strategies, the domain shift problem is overcome, achieving an average precision of 92.7% on the target crop test set. False detection rates in foggy and strong light conditions are controlled at 6.3% and 9.8%, respectively. This study achieves dual breakthroughs in terms of both high-precision detection in complex scenarios and the cross-crop generalization ability of lightweight models. It provides a new paradigm for universal agricultural disease monitoring systems that can be deployed at the edge.</p>
</abstract>
<kwd-group>
<kwd>tomato leaf disease</kwd>
<kwd>precision agriculture</kwd>
<kwd>agricultural artificial intelligence</kwd>
<kwd>multi-scale detection</kwd>
<kwd>transfer learning</kwd>
</kwd-group>
<counts>
<fig-count count="14"/>
<table-count count="6"/>
<equation-count count="6"/>
<ref-count count="21"/>
<page-count count="17"/>
<word-count count="6863"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Global agricultural diseases cause over 220 billion US dollars in economic losses each year. Among these, the early detection of leaf diseases in crops is a key part of precision plant protection (<xref ref-type="bibr" rid="B1">Astani et&#xa0;al., 2022</xref>). As an important economic crop cultivated worldwide, the control of tomato diseases directly affects crop yield, quality, and agricultural economic benefits. Although deep learning-based detection methods have made significant progress in single-crop scenarios, models generally perform poorly in cross-crop scenarios due to domain specificity and the scarcity of labelled data. This limitation is particularly evident in developing countries with limited resources: small-scale farmers cannot afford the development costs of multi-crop-specific models, and agricultural expert systems lack the ability to generalise heterologous disease features. This results in delayed disease warnings and the waste of prevention and control resources. Therefore, innovating and applying leaf disease detection technology is important for ensuring food security, promoting sustainable agricultural development, and advancing smart agricultural systems.</p>
<p>In natural farmland environments, leaf disease detection often faces multiple challenges, such as leaves shading each other, disease patches having diverse shapes, light conditions fluctuating and diseases having weak early symptoms. Traditional image processing methods mostly rely on colour space segmentation combined with texture feature extraction for classification. For example, Javidan et&#xa0;al. used K-means clustering to segment diseased areas and applied morphological operations to optimise edge detection, achieving a 98.97% accuracy rate under controlled lighting conditions (<xref ref-type="bibr" rid="B7">Javidan et&#xa0;al., 2023</xref>). Similarly, Bhagat et&#xa0;al. used a grid search-based SVM for classification and detection of plant leaf diseases (<xref ref-type="bibr" rid="B3">Bhagat et&#xa0;al., 2020</xref>), while Rodr&#xed;guez et&#xa0;al. collected potato canopy images using a drone equipped with a multispectral sensor (<xref ref-type="bibr" rid="B14">Rodr&#xed;guez et&#xa0;al., 2021</xref>). They combined vegetation indices and machine learning algorithms to achieve early detection and severity assessment of potato late blight. Furthermore, Saleem et&#xa0;al. designed a leaf segmentation process based on the ExG index and the region-growing method and used the proportion of the diseased area to assess severity (<xref ref-type="bibr" rid="B15">Saleem et&#xa0;al., 2024</xref>). However, traditional methods generalize poorly in complex farmland scenarios due to their heavy reliance on manual feature design and experience-driven parameter tuning, and are difficult to adapt to the multi-scale disease representation requirements in dynamic field environments.</p>
<p>The advent of convolutional neural networks (CNNs) has precipitated a paradigm shift in the realm of agricultural disease detection, with end-to-end architectures predicated on single-stage detectors becoming the prevailing paradigm. This is primarily attributable to the enhanced efficiency exhibited by these architectures. In the context of tomato leaf disease detection, researchers frequently employ a combination of deep learning models and conventional image processing techniques to enhance the precision of lesion localization. For instance, Barbedo proposed a threshold segmentation approach based on the HSV color space and morphological processing to extract lesion areas and verified the feasibility of combining traditional methods with CNNs (<xref ref-type="bibr" rid="B2">Barbedo, 2018</xref>). Similarly, R. et&#xa0;al. embedded an attention mechanism in a pre-trained residual CNN, combined with multi-spectral data to enhance lesion feature expression, improving the discrimination of lesion features in complex environments (<xref ref-type="bibr" rid="B12">R. et&#xa0;al., 2020</xref>). Furthermore, Cong et&#xa0;al. developed a lightweight Mask R-CNN variant, optimizing lesion boundary localization through the integration of superpixel segmentation and edge detection algorithms (<xref ref-type="bibr" rid="B5">Cong et&#xa0;al., 2023</xref>). Shin et&#xa0;al. proposed a feature extraction and data augmentation strategy combining a CNN with RGB images (<xref ref-type="bibr" rid="B18">Shin et&#xa0;al., 2021</xref>). This strategy achieved an average accuracy of 92.18% in the detection of strawberry leaf powdery mildew. Despite the efficacy of the aforementioned methods in certain contexts, they remain confronted with numerous challenges in the context of natural farmlands.
The distinguishing characteristics of disease spots are often obscured by leaf occlusion in complex backgrounds, leading to ambiguity in identification (<xref ref-type="bibr" rid="B6">Debnath et&#xa0;al., 2023</xref>). That study adopts EfficientNetV2B2 as the lightweight backbone network to achieve efficient and accurate disease identification. Using the DL approach, tomato leaf disease identification achieves nearly 100% accuracy on a test dataset. Additionally, the presence of similar diseases can result in confusion regarding texture, and the identification of early disease spots with low contrast can be challenging (<xref ref-type="bibr" rid="B15">Saleem et&#xa0;al., 2024</xref>). Furthermore, the method&#x2019;s accuracy in distinguishing cross-diseases with similar symptoms is often limited.</p>
<p>Currently, the field of plant disease detection generally faces the bottleneck of model generalization caused by domain differences. Existing research is mostly limited to customized training for single-crop diseases and is difficult to effectively transfer to heterologous crops. The detection of tomato leaf diseases in real-world agricultural settings is hindered by several key challenges, including leaf occlusion and overlap. Different diseases also share visual characteristics, leading to misclassification. To address this issue, this study proposes a cross-crop transfer learning framework that breaks through the domain shift limitations of cross-species disease recognition by sharing low-level feature representations and domain adaptation optimization strategies. Specifically, a CNN backbone model is trained on a tomato leaf disease dataset, and the transfer learning framework freezes the shallow feature extraction layers to retain the common texture and morphological features of crops and adapt to the specific phenotypes of target crop diseases, combined with adversarial training to minimize the distribution differences between domains. This achievement provides a cross-crop transfer learning paradigm for building a universal plant disease intelligent monitoring system and promotes the large-scale application of precision plant protection technology.</p>
<p>The primary contributions of this paper are as follows:</p>
<list list-type="order">
<list-item>
<p>In order to address the challenges posed by the attenuation of features in small target disease spots and the failure to detect early disease spots, a novel dual-branch adaptive alignment module has been designed. Through dynamic feature alignment and cross-scale feature interaction, it significantly improves the detection accuracy and robustness of tomato leaf diseases in complex agricultural environments.</p>
</list-item>
<list-item>
<p>The Faster-GLUDet feature enhancement unit was integrated, which employs partial convolution and local context-aware gating mechanisms. This enhancement to the model&#x2019;s noise suppression capabilities is achieved while maintaining its lightweight nature.</p>
</list-item>
<list-item>
<p>The construction of a multi-scale decoupled detection head was undertaken. The model achieves balanced detection of cross-scale diseases and efficient distinction between small disease spots, spreading lesions, and mixed diseases through hierarchical feature fusion and Group Normalization optimization.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data processing</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Data source</title>
<p>The tomato leaf disease dataset used in this study was sourced from the &#x201c;Tomato Leaf Diseases Detect&#x201d; standardized dataset released by the Roboflow open platform. It contains six typical disease categories (bacterial spot, early blight, late blight, leaf mold, target spot, and black spot) and healthy leaf samples, covering the early, middle, and late stages of disease development. In total, it includes 3,469 high-resolution RGB images. The six tomato leaf diseases of interest in this study are highly prevalent in major tomato-growing regions worldwide, causing yield losses of 20% to 65% (<xref ref-type="bibr" rid="B10">Lu et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B11">Panno et&#xa0;al., 2021</xref>). <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> provides a detailed breakdown of the final image distribution across all categories after augmentation and splitting. The common bean dataset combines images captured at the Guoxin Modern Agricultural Base in Changchun City, Jilin Province, with the public Bean Disease Dataset. The potato dataset is taken from a public potato disease dataset, which includes three categories: healthy, early disease and late disease (<ext-link ext-link-type="uri" xlink:href="https://gitcode.com/open-source-toolkit/829ec">https://gitcode.com/open-source-toolkit/829ec</ext-link>). The inclusion of these diverse datasets from different crops is intended to rigorously validate the transferability of the features learned by our model from tomatoes to other commercially important crops.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Data distribution.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Data distribution</th>
<th valign="middle" align="center">Training set</th>
<th valign="middle" align="center">Validation set</th>
<th valign="middle" align="center">Test set</th>
<th valign="middle" align="center">All</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Bacterial Spot</td>
<td valign="middle" align="center">2841</td>
<td valign="middle" align="center">465</td>
<td valign="middle" align="center">221</td>
<td valign="middle" align="center">3527</td>
</tr>
<tr>
<td valign="middle" align="center">Early Blight</td>
<td valign="middle" align="center">4994</td>
<td valign="middle" align="center">618</td>
<td valign="middle" align="center">669</td>
<td valign="middle" align="center">6281</td>
</tr>
<tr>
<td valign="middle" align="center">Healthy</td>
<td valign="middle" align="center">1621</td>
<td valign="middle" align="center">227</td>
<td valign="middle" align="center">271</td>
<td valign="middle" align="center">2119</td>
</tr>
<tr>
<td valign="middle" align="center">Late Blight</td>
<td valign="middle" align="center">2908</td>
<td valign="middle" align="center">408</td>
<td valign="middle" align="center">202</td>
<td valign="middle" align="center">3518</td>
</tr>
<tr>
<td valign="middle" align="center">Leaf Mold</td>
<td valign="middle" align="center">2871</td>
<td valign="middle" align="center">274</td>
<td valign="middle" align="center">361</td>
<td valign="middle" align="center">3506</td>
</tr>
<tr>
<td valign="middle" align="center">Target Spot</td>
<td valign="middle" align="center">2296</td>
<td valign="middle" align="center">281</td>
<td valign="middle" align="center">244</td>
<td valign="middle" align="center">2821</td>
</tr>
<tr>
<td valign="middle" align="center">Black Spot</td>
<td valign="middle" align="center">3710</td>
<td valign="middle" align="center">335</td>
<td valign="middle" align="center">143</td>
<td valign="middle" align="center">4188</td>
</tr>
<tr>
<td valign="middle" align="center">All</td>
<td valign="middle" align="center">21241</td>
<td valign="middle" align="center">2608</td>
<td valign="middle" align="center">2111</td>
<td valign="middle" align="center">25960</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Data enhancement</title>
<p>To enhance the robustness of the model, the dataset was expanded to 7370 images using data augmentation techniques. To prevent an original image and its augmented versions from appearing in both the training set and the validation set, the original images were first divided into training, validation, and test sets in a ratio of approximately 8:1:1. Subsequently, five techniques, namely horizontal flipping, vertical flipping, grayscale conversion, contrast adjustment, and brightness adjustment, were randomly applied to the original data to augment the images and their labels. Examples of the augmented images are shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>, and the label distribution is shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Data enhancement methods for tomato leaf images: <bold>(a)</bold> original image; <bold>(b)</bold> horizontal flip; <bold>(c)</bold> vertical flip; <bold>(d)</bold> grayscale conversion; <bold>(e)</bold> contrast adjustment; <bold>(f)</bold> brightness adjustment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g001.tif">
<alt-text content-type="machine-generated">A collage showing six images of a single leaf with various transformations. (a) Original figure in color. (b) Horizontal flip. (c) Vertical flip. (d) Grayscale version. (e) Contrast adjustment. (f) Brightness adjustment. Each transformation alters the appearance of the leaf slightly.</alt-text>
</graphic>
</fig>
<p>In order to simulate the complex weather changes in the real tomato cultivation environment, this study adopts the RGB channel synthesis technology based on the atmospheric scattering model to generate enhanced images with controllable weather features on 50% of the typical samples in the training set. The synthesis formula is shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>.</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mi>J</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In this study, <inline-formula>
<mml:math display="inline" id="im1">
<mml:mi>x</mml:mi>
</mml:math>
</inline-formula> denotes the pixel coordinate, <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> signifies the synthesized image, <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the original image, the transmittance map <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is constrained within the interval [0.2, 0.8] and controls the weather intensity gradient, and the atmospheric light value <inline-formula>
<mml:math display="inline" id="im5">
<mml:mi>A</mml:mi>
</mml:math>
</inline-formula> restricts the amplitude of illumination attenuation. <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> presents an image of medium-intensity synthetic weather.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Weather synthesis applied to a randomly selected leaf image: <bold>(a)</bold> original image; <bold>(b)</bold> rain simulation; <bold>(c)</bold> fog simulation; <bold>(d)</bold> solar flare simulation; <bold>(e)</bold> overexposure simulation; <bold>(f)</bold> snow simulation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g002.tif">
<alt-text content-type="machine-generated">Six images of a leaf illustrating different weather and lighting conditions. (a) Original image of a leaf. (b) Leaf obscured by rain. (c) Leaf partially obscured by fog. (d) Leaf affected by solar flare. (e) Leaf with overexposure effects. (f) Leaf with falling snowflakes in the foreground.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Methodology</title>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>Tomato multi-scenario adaptive scale detector</title>
<p>While general-purpose detectors like YOLOv11 have demonstrated strong performance on common datasets, their inherent architecture is not optimally designed for the unique challenges presented by complex agricultural environments, such as severe scale variation of lesions, leaf occlusions, and pervasive background noise. These limitations often lead to feature misalignment, reduced sensitivity to small objects, and compromised robustness under fluctuating lighting conditions.</p>
<p>To address these specific issues, we propose the Tomato Multi-scenario Adaptive Scale Detector (ToMASD), a novel lightweight architecture specifically engineered for high-precision disease detection in real-world field settings. The overarching design philosophy of ToMASD is to achieve an optimal balance between computational efficiency and detection accuracy by introducing three dedicated core modules that work in concert throughout the feature extraction and fusion pipeline.</p>
<p>As illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>, the Two-branch Adaptive Alignment Module (TAAM) is integrated into the backbone network. Its purpose is to dynamically align and calibrate multi-scale features at the earliest stage, effectively mitigating the semantic misalignment between healthy and diseased tissue regions caused by occlusion and scale variance. The Faster-Gated Linear Unit (Faster-GLUDet) is embedded within the neck network. This module acts as an adaptive feature refiner, leveraging a gating mechanism to suppress irrelevant background noise. The Multi-scale Decoupling Head (MDH) is designed as the detection head. It replaces the conventional coupled head with a decoupled structure, allowing for independent optimization for classification and regression tasks at different feature scales. This synergistic design ensures that ToMASD is uniquely capable of handling the complexities of agricultural disease detection.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>ToMASD model structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g003.tif">
<alt-text content-type="machine-generated">Flowchart of a neural network model architecture consisting of three main sections: Backbone, Neck, and Head. The Backbone includes modules like TAAM, SPPF, and C2PSA, processing an input image. The Neck involves operations like Upsample, Concat, and Faster-GLUDet, interacting with CBS. The Head concludes with the MDH module, outputting an image featuring a plant leaf. Arrows indicate the flow of data between components.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Two-branch adaptive alignment module</title>
<p>The present study focuses on the problems of insufficient feature extraction of small-scale targets and redundant shallow computations in the YOLOv11 backbone for leaf lesion detection. It proposes a novel dual-branch adaptive alignment module, the Two-branch Adaptive Alignment Module (TAAM), as shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>. The module achieves efficient computation through a multi-level feature sharing architecture. Firstly, the Pointwise Spatial Attention Stem (PSAStem) is utilised as the shared initial extraction layer, which pre-calibrates the input features through 1&#xd7;1 pointwise convolution and an adaptive mechanism, thereby enabling the network to form dynamic spatial focusing capabilities at the input stage. Subsequently, the feature maps are processed through dual paths. The primary pathway integrates two C3k2 modules, each containing three standard 3&#xd7;3 convolutions with 64 output channels, and a standard 3&#xd7;3 convolution to preserve intricate features. The secondary pathway employs a 1&#xd7;1 dimension-reducing convolution (reducing channels by a factor of 2) and subsequently connects to the optimised PSABlock. The initial feature extraction module combines pointwise convolution and spatial attention mechanisms, enabling the network to prioritise key regions in the input image and enhance the dynamic focusing ability on key spatial regions in the input stage while maintaining computational efficiency.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>TAAM internal structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g004.tif">
<alt-text content-type="machine-generated">Diagram illustrating the TAAM model with pathways of blocks and connections. The flow begins with an input leading to PSAStem, followed by C3k2, Conv, PSABlock, AAM, and DWConv nodes. It shows connections for two configurations of C3k2 with c3k set to false and true, highlighting a split, bottleneck, and concatenate process within each section. The PSABlock is noted for its attention mechanism followed by dual convolution layers.</alt-text>
</graphic>
</fig>
<p>After the dual-branch channels process the features in parallel, they are connected to the Adaptive Alignment Module (AAM), as shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. The input dual-channel features F<sub>1</sub> and F<sub>2</sub> are first compressed in the channel dimension through 1&#xd7;1 convolution layers to obtain F<sub>1</sub>&#x2019; and F<sub>2</sub>&#x2019;, and then adaptive alignment weights (Adaptive Align Weight) are generated through the cross-branch feature interaction layer to dynamically balance the contribution of the two paths. They are input into a 3&#xd7;3 dilated convolution to capture long-range context dependencies and generate dynamic path selection weights &#x3b1; through the Sigmoid activation function. After spatial alignment of the two paths, the key region responses are enhanced through element-wise multiplication, as shown in <xref ref-type="disp-formula" rid="eq2">Equations 2</xref> and <xref ref-type="disp-formula" rid="eq3">3</xref>, where &#x3c3; represents the Sigmoid activation function and W<sub>&#x3b1;</sub> is a learnable weight matrix. Finally, multi-scale feature complementarity is achieved through element-wise addition, and the results are merged and output.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>AAM structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g005.tif">
<alt-text content-type="machine-generated">Diagram of a computational process starting with two features. Each feature undergoes a 1x1 convolution, followed by concatenation. A 3x3 convolution and sigmoid function are applied, then split. Element-wise multiplication with parameters occurs, followed by element-wise summation. The process concludes with another 1x1 convolution. Key symbols indicate operations of summation and multiplication.</alt-text>
</graphic>
</fig>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>x1_weight</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>&#x2297;</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>x2_weight</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2297;</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The innovation of this module can be attributed to the following two mechanisms: 1) A dynamic weight adjustment mechanism is employed to optimise branch weights in real time based on the semantic distribution of the input features. This mechanism alleviates the spatial offset problem of heterogeneous features, such as misalignment interference between leaf lesions and healthy tissues. 2) Through the collaborative design of dilated convolution and gated attention, computational redundancy is reduced while local details and global pathological patterns are jointly modelled.</p>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Faster-gated linear unit</title>
<p>The neck network of YOLOv11 employs depthwise separable convolution and channel pruning strategies, which have been shown to enhance the recall rate of small targets while concurrently reducing the model&#x2019;s parameters. However, two problems remain. Firstly, tomato leaf diseases frequently manifest as minute spots, and the receptive field of the P5 layer in the feature pyramid is overly extensive, which may impede the learning of small target features. Secondly, when the brown necrotic spots of tomato late blight are similar in colour to the soil, the feature pyramid network may confuse the target with the background. In order to address these issues, we propose Faster-GLUDet, whose core lies in enhancing the model&#x2019;s ability to extract disease features in complex backgrounds through a gating mechanism while keeping the model lightweight. The Faster-GLUDet module integrates FasterNet Block and Convolutional GLU (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B17">Shi, 2023</xref>), as illustrated in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Faster-GLUDet module structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g006.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture. The input passes through a FasterNet Block leading to a Convolutional Gate Linear Unit. The FasterNet Block contains a series of operations: PConv 3x3, followed by two Conv 1x1 layers, with their outputs added together. The Convolutional Gate Linear Unit involves an input branching into two Linear operations, a DWConv 3x3, followed by an activation function, and a Linear operation, with outputs combined using addition operations.</alt-text>
</graphic>
</fig>
<p>The module&#x2019;s primary function is the extraction of feature units through the utilisation of FasterNet Block, employing a 3&#xd7;3 Partial Convolution to extract spatial features from a quarter of the input channels. This approach results in a 25% reduction in computational load when compared with traditional convolution, while retaining edge detail information. Subsequently, two 1&#xd7;1 convolutions are connected to perform channel dimension reduction and feature fusion, thereby ensuring the effective preservation of multi-scale disease features. In order to enhance the feature interaction between modules, a dynamic gated feature enhancement unit, known as the Convolutional Gated Linear Unit (ConvGLU), is connected after the FasterNet Block. This unit employs a dual-branch convolution to generate feature maps and gating signals, dynamically suppressing background noise and enhancing the response in the lesion area through element-wise multiplication. In the gating branch of the traditional gated linear unit (GLU), ConvGLU decomposes the standard 3&#xd7;3 convolution into a cascade structure of depthwise convolution (DWConv) and pointwise convolution (PWConv), and combines a linear projection layer and GELU activation function to construct a lightweight feature enhancement path. A lightweight 3&#xd7;3 depthwise convolution operation is introduced prior to the activation function in order to construct a gating channel attention mechanism based on neighbourhood features. The design converts global channel attention into local context-aware dynamic weight adjustment through the local receptive field characteristics of the convolution kernel. This retains the important information filtering ability in the channel dimension while significantly reducing computational complexity. ConvGLU employs convolution operations to capture local features in the image, thereby enhancing its efficacy in processing local information in comparison to a traditional feed-forward network (FFN). It is also capable of adaptively enhancing small target features.</p>
<p>The integration of Faster-GLUDet has been demonstrated to enhance the model&#x2019;s feature extraction and expression capabilities to a considerable extent. The lightweight design of the FasterNet Block provides low-latency input for ConvGLU, while the gated weight generation module of ConvGLU further optimises multi-scale feature interaction, enabling the model to remain lightweight while enhancing the diversity and hierarchy of features. This, in turn, helps better capture the details and context information of the target object. The system has been engineered to achieve dynamic regulation of feature channels, thereby further enhancing the semantic segmentation and spatial understanding capabilities of features. The combination of these two approaches has been shown to more effectively fuse multi-scale features and alleviate the problem of information loss, significantly improving the model&#x2019;s performance in challenging plant disease recognition tasks.</p>
</sec>
<sec id="s2_2_4">
<label>2.2.4</label>
<title>Multi-scale decoupling head</title>
<p>In order to address the issue of uneven detection accuracy of traditional detection heads for small and large-scale lesions, the Multi-scale Decoupling Head (MDH) has been proposed, as illustrated in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>. The core process is as follows: MDH receives three different-scale feature maps - P3, P4 and P5 - from the Feature Pyramid Network (FPN) in parallel, which respectively carry high-resolution details, medium-scale information and large receptive field context, thereby constructing a multi-scale perception foundation. The features of each scale first enter a unified feature enhancement pathway, which is composed of a series of group-normalized convolutional (Conv_GN) modules: first, the channel dimension is adjusted and fused through a 1&#xd7;1 Conv_GN, and then two 3&#xd7;3 Conv_GN modules are applied in succession to enhance the spatial feature expression. At the same time, group normalization is utilized to ensure the stability of the model under small-batch training. After feature enhancement, the network flow is completely decoupled into two independent branches dedicated to their respective functions. The classification branch precisely extracts features through two consecutive 1&#xd7;1 convolutional layers and ultimately outputs a probability map with a dimension of nc, accurately determining the category of the target within each anchor box. The regression branch adopts the same structure, but its output dimension is 4 &#xd7; reg_max, which indicates that it uses the Distribution Focal Loss (DFL) mechanism. By predicting the discrete distribution of bounding box coordinates, it greatly improves the accuracy of lesion localization, where reg_max defines the flexible maximum value of the distribution. Ultimately, the outputs of the two branches are respectively normalized and integrated through a scale layer to generate the final detection results.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>MDH detection head structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g007.tif">
<alt-text content-type="machine-generated">Flowchart showing a network architecture. Three inputs, P3, P4, and P5, connect to a series of boxes labeled &#x201c;Conv_GN&#x201d; with dimensions 1x1 and 3x3. These connect to output layers labeled &#x201c;cls&#x201d; and &#x201c;4*reg_max&#x201d; with dimensions 1x1, followed by outputs labeled &#x201c;nc&#x201d; and &#x201c;scale.&#x201d; The diagram illustrates a convolutional network sequence.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Analysis of experimental results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Experimental platform and parameter settings</title>
<p>In this study, the image input size is set to 640&#xd7;640 pixels. To accelerate convergence, the initial learning rate is set to 0.01 and the stochastic gradient descent (SGD) algorithm is used for training, with a weight decay coefficient of 0.0005 and a momentum factor of 0.937. The model is trained for a total of 200 epochs with a batch size of 32, and the number of workers is set to 12. All experiments are performed on a Linux server, and the specific configuration of the experimental environment is shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Experimental environment configuration.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Environment Configuration</th>
<th valign="middle" align="center">Parameters</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">GPU</td>
<td valign="middle" align="left">2*A100(80GB)</td>
</tr>
<tr>
<td valign="middle" align="left">CPU</td>
<td valign="middle" align="left">Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz</td>
</tr>
<tr>
<td valign="middle" align="left">Development environment</td>
<td valign="middle" align="left">PyCharm 2023.2.5</td>
</tr>
<tr>
<td valign="middle" align="left">Language</td>
<td valign="middle" align="left">Python 3.8.10</td>
</tr>
<tr>
<td valign="middle" align="left">Framework</td>
<td valign="middle" align="left">PyTorch 2.0.1</td>
</tr>
<tr>
<td valign="middle" align="left">Operating platform</td>
<td valign="middle" align="left">CUDA 11.8</td>
</tr>
<tr>
<td valign="middle" align="left">Operating System</td>
<td valign="middle" align="left">Linux</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Analysis and evaluation of the identification results</title>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Evaluation index</title>
<p>In this paper, the metrics Precision, Recall, and mAP are utilised to evaluate the detection performance of the model. TP, FP, and FN represent the number of true positive, false positive, and false negative samples, respectively. C denotes the set of object categories, and |C| is the total number of categories. The metrics are defined in <xref ref-type="disp-formula" rid="eq4">Equations 4</xref>, <xref ref-type="disp-formula" rid="eq5">5</xref> and <xref ref-type="disp-formula" rid="eq6">6</xref>.
</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>P denotes the proportion of correctly detected disease samples among all positive detections, thereby reflecting the model&#x2019;s capacity to avoid false positives. R signifies the proportion of correctly detected disease samples among the actual existing disease samples, thus measuring the model&#x2019;s ability to reduce false negatives. AP quantifies the detection performance of the model for a single disease category by calculating the area under the precision-recall curve. mAP is the average of APs for all categories, and a higher mAP indicates that the model&#x2019;s detection effect on various disease categories is more balanced and accurate.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Comparative experiments of different models</title>
<p>To comprehensively evaluate the detection and generalization performance of the proposed ToMASD model, we conducted extensive comparative experiments with thirteen state-of-the-art object detection models on the same tomato leaf disease dataset. As summarized in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, ToMASD achieved the highest scores in both precision and mAP, significantly outperforming all other contenders. RT-DETR achieves high accuracy but at the cost of high computational complexity, making it unsuitable for edge deployment. In comparison with the unimproved YOLOv11n, ToMASD has increased P, mAP, and Recall by 6.6%, 7.8%, and 5.9%, respectively, demonstrating its superior ability in target localization and classification in complex scenarios. Despite the fact that YOLOv11n exhibits a modestly diminished number of FLOPs in comparison with ToMASD, a notable deterioration in accuracy is evident, suggesting the potential for optimisation shortcomings within the feature extraction process. Despite the advantages in parameter quantity and computational cost of YOLOv5n and YOLOv11n, their accuracy still lags significantly behind ToMASD, further confirming the dual improvements in accuracy and recognition performance of the ToMASD model. Whilst remaining lightweight, it has considerably surpassed the constraints of prevailing algorithms in the trade-off between accuracy and computational resource consumption. As illustrated in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>, a performance comparison of the six models with the highest mAP is presented. A selection of six algorithms with comparable performance was made for the purpose of a comprehensive comparison, as illustrated in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>. On each axis, the farther a curve lies from the centre of the chart, the better the corresponding metric. The area enclosed by the curve is positively correlated with the strength of the algorithm&#x2019;s comprehensive performance. The comparison results demonstrate that the ToMASD model proposed in this paper exhibits advantages in all metrics, not only improving performance but also remaining lightweight, thus rendering it more suitable for practical scenarios.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of object detection results of different algorithms.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Models</th>
<th valign="middle" align="center">P%</th>
<th valign="middle" align="center">mAP%</th>
<th valign="middle" align="center">Recall%</th>
<th valign="middle" align="center">FLOPs/G</th>
<th valign="middle" align="center">Parameters</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">SSD (<xref ref-type="bibr" rid="B9">Liu et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="center">76.5</td>
<td valign="middle" align="center">72.3</td>
<td valign="middle" align="center">70.7</td>
<td valign="middle" align="center">200.6</td>
<td valign="middle" align="center">4.48&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv3-tiny (<xref ref-type="bibr" rid="B13">Redmon and Farhadi, 2018</xref>)</td>
<td valign="middle" align="center">73.6</td>
<td valign="middle" align="center">66.8</td>
<td valign="middle" align="center">61.1</td>
<td valign="middle" align="center">18.9</td>
<td valign="middle" align="center">1.21&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5n</td>
<td valign="middle" align="center">74.5</td>
<td valign="middle" align="center">71.3</td>
<td valign="middle" align="center">69.1</td>
<td valign="middle" align="center">4.2</td>
<td valign="middle" align="center">1.76&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv6 (<xref ref-type="bibr" rid="B8">Li et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="center">69.1</td>
<td valign="middle" align="center">68.2</td>
<td valign="middle" align="center">65.6</td>
<td valign="middle" align="center">11.1</td>
<td valign="middle" align="center">4.23&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv7-tiny (<xref ref-type="bibr" rid="B19">Wang et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="center">70.0</td>
<td valign="middle" align="center">69.4</td>
<td valign="middle" align="center">68.9</td>
<td valign="middle" align="center">13.2</td>
<td valign="middle" align="center">6.07&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8n</td>
<td valign="middle" align="center">73.8</td>
<td valign="middle" align="center">72.5</td>
<td valign="middle" align="center">68.8</td>
<td valign="middle" align="center">8.7</td>
<td valign="middle" align="center">3.00&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8s</td>
<td valign="middle" align="center">77.8</td>
<td valign="middle" align="center">75.6</td>
<td valign="middle" align="center">77.5</td>
<td valign="middle" align="center">28.6</td>
<td valign="middle" align="center">1.12&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv9t (<xref ref-type="bibr" rid="B21">Wang et&#xa0;al., 2024b</xref>)</td>
<td valign="middle" align="center">69.1</td>
<td valign="middle" align="center">67.8</td>
<td valign="middle" align="center">70.1</td>
<td valign="middle" align="center">7.9</td>
<td valign="middle" align="center">2.01&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv9s</td>
<td valign="middle" align="center">73.9</td>
<td valign="middle" align="center">71.5</td>
<td valign="middle" align="center">72.4</td>
<td valign="middle" align="center">26.7</td>
<td valign="middle" align="center">7.17&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10n (<xref ref-type="bibr" rid="B20">Wang et&#xa0;al., 2024a</xref>)</td>
<td valign="middle" align="center">76.2</td>
<td valign="middle" align="center">70.8</td>
<td valign="middle" align="center">67.9</td>
<td valign="middle" align="center">8.2</td>
<td valign="middle" align="center">2.69&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11n</td>
<td valign="middle" align="center">74.9</td>
<td valign="middle" align="center">73.9</td>
<td valign="middle" align="center">74.7</td>
<td valign="middle" align="center">6.7</td>
<td valign="middle" align="center">2.76&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11s</td>
<td valign="middle" align="center">77.7</td>
<td valign="middle" align="center">75.4</td>
<td valign="middle" align="center">77.8</td>
<td valign="middle" align="center">21.5</td>
<td valign="middle" align="center">9.4&#xd7;10<sup>6</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">RT-DETR</td>
<td valign="middle" align="center">80.1</td>
<td valign="middle" align="center">77.6</td>
<td valign="middle" align="center">78.9</td>
<td valign="middle" align="center">56.9</td>
<td valign="middle" align="center">3.27&#xd7;10<sup>7</sup>
</td>
</tr>
<tr>
<td valign="middle" align="center">ToMASD</td>
<td valign="middle" align="center">84.3</td>
<td valign="middle" align="center">81.7</td>
<td valign="middle" align="center">80.6</td>
<td valign="middle" align="center">7.1</td>
<td valign="middle" align="center">2.46&#xd7;10<sup>6</sup>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Performance bar charts of six models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g008.tif">
<alt-text content-type="machine-generated">Bar chart comparing the performance of different models: YOLOv5n, YOLOv8n, YOLOv9s, YOLOv11n, YOLOv11s, RT-DETR, and ToMASD. Metrics shown are P%, mAP%, and FLOPs(G). All models show high P% and mAP% values between 60 and 100, while FLOPs(G) values vary significantly, with RT-DETR showing the highest value.</alt-text>
</graphic>
</fig>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Comprehensive comparison of performance of six models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g009.tif">
<alt-text content-type="machine-generated">Radar chart comparing object detection models YOLOv5n, YOLOv8n, YOLOv9s, YOLOv11n, YOLOv11s, RT-DETR, and ToMASD across five metrics: Precision, mAP, Recall, Parameters, and FLOPs. Models are represented in different colors as indicated by the legend.</alt-text>
</graphic>
</fig>
<p>To verify the performance of the TAAM module, this study systematically evaluated the effects of four mainstream attention modules as the Stem, as shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. PSA achieved the best balance between accuracy and computational efficiency, reducing the computational complexity by 17.7% compared to the ECA module with the second-highest accuracy, verifying the performance of PSA.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparison of different attention modules.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">PSA</th>
<th valign="middle" align="left">CBAM</th>
<th valign="middle" align="left">ECA</th>
<th valign="middle" align="left">ELAN</th>
<th valign="middle" align="left">P%</th>
<th valign="middle" align="left">FLOPs/G</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left">77.4</td>
<td valign="middle" align="left">20.9</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left">72.8</td>
<td valign="middle" align="left">20.9</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left"/>
<td valign="middle" align="left">76.2</td>
<td valign="middle" align="left">25.4</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">73.6</td>
<td valign="middle" align="left">23.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref> shows the detection performance of the five models with the highest accuracy under different weather conditions, where gray boxes indicate missed detections and black boxes indicate false positives. Under foggy conditions, except for ToMASD, the other four models misjudged fog points as diseases. When the light intensity was too high, the comparison models also failed to effectively suppress the overexposed areas, resulting in missed detections of some real lesions. <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref> shows the detection results of the four lightweight models in conventional environments. Comprehensive comparative analysis shows that ToMASD exhibits the most superior performance under various complex weather conditions and has superior feature extraction and denoising capabilities in real agricultural environments. <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref> shows ToMASD&#x2019;s detection of potato and common bean leaf diseases. The experimental results show that ToMASD maintains high accuracy in the cross-crop task, achieving 92.1% and 93.5% accuracy in the detection of bean and potato leaf diseases, respectively, demonstrating its strong generalization ability and cross-species transferability. <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> shows the recognition accuracy of ToMASD for different disease categories, and <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref> shows the confusion matrix of this experiment. The model still maintains stable recognition performance on the class-imbalanced dataset: the recognition accuracy for Late Blight and Early Blight is close to 90%, whereas the yellow spots of Leaf Mold have a chromaticity similar to that of healthy tissue, which leads to relatively low detection accuracy. Bacterial Spot and Target Spot have similar water-damaged spot characteristics, but the model still achieved 82.1% and 74.1% mAP values through multi-scale texture analysis, indicating the effectiveness of the feature decoupling mechanism.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Detection results of five models under different weather conditions (gray boxes indicate missed detections and black boxes indicate false positives).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g010.tif">
<alt-text content-type="machine-generated">Grid of images comparing various YOLO and RT-DETR object detection models. Each row represents a model version: YOLOv8s, YOLOv10n, YOLOv11s, RT-DETR, and ToMASD. Images include leaves with overlapping boxes in different colors indicating detection results, along with numerical confidence scores in various hues.</alt-text>
</graphic>
</fig>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Detection results of the four models in a conventional detection environment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g011.tif">
<alt-text content-type="machine-generated">Grid of images comparing plant leaf detection across four models: YOLOv8n, YOLOv10n, YOLOv11n, and ToMASD. Each model produces different bounding boxes and confidence scores in four rows, indicating variations in object detection performance on the leaves.</alt-text>
</graphic>
</fig>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>The detection results of leaf diseases of common beans and potatoes by ToMASD: <bold>(a)</bold> bean disease, <bold>(b)</bold> potato disease. The models from left to right are YOLOv8n, YOLOv10n, YOLOv11n and ToMASD.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g012.tif">
<alt-text content-type="machine-generated">Images showing bean and potato leaf diseases detected by different machine learning models: YOLOv8n, YOLOv10n, YOLOv11n, and ToMASD. The bean images illustrate viral and rust infections, while the potato images display early and general late blight. Each row represents the same leaf, showcasing various detection confidence levels.</alt-text>
</graphic>
</fig>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Evaluation indicators for different diseases.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Evaluation index</th>
<th valign="middle" align="center">Bacterial Spot</th>
<th valign="middle" align="center">Early Blight</th>
<th valign="middle" align="center">Healthy</th>
<th valign="middle" align="center">Late Blight</th>
<th valign="middle" align="center">Leaf Mold</th>
<th valign="middle" align="center">Target Spot</th>
<th valign="middle" align="center">Black Spot</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">P%</td>
<td valign="middle" align="center">85.4</td>
<td valign="middle" align="center">90.1</td>
<td valign="middle" align="center">99.8</td>
<td valign="middle" align="center">85.2</td>
<td valign="middle" align="center">71.9</td>
<td valign="middle" align="center">77.8</td>
<td valign="middle" align="center">80.3</td>
</tr>
<tr>
<td valign="middle" align="center">mAP%</td>
<td valign="middle" align="center">82.1</td>
<td valign="middle" align="center">88.0</td>
<td valign="middle" align="center">98.7</td>
<td valign="middle" align="center">82.4</td>
<td valign="middle" align="center">69.5</td>
<td valign="middle" align="center">74.1</td>
<td valign="middle" align="center">76.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Confusion matrix.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g013.tif">
<alt-text content-type="machine-generated">Normalized confusion matrix displaying predicted versus true classifications. Diagonal values indicate prediction accuracy for each class, with the highest accuracy being 1.00 for class 2. Color intensity corresponds to prediction accuracy, with darker shades representing higher values.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Ablation experiment</title>
<p>The proposed ToMASD model is based on YOLOv11n and has been optimised by introducing TAAM, Faster-GLUDet, and MDH. In order to evaluate the performance of each optimisation module, an experiment was conducted using the variable control method. The training and testing were carried out on the same dataset and training parameters, and the results are shown in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. It is evident that Model C attained an mAP of 77.9% while sustaining the lowest computational cost (4.3 GFLOPs). However, Model D, which introduced TAAM and Faster-GLUDet, exhibited an increase in computational cost to 12.0 GFLOPs, attributable to parameter redundancy, while yielding a precision (P) of only 79.2%. The experiments also indicated that the joint application of TAAM and MDH (Model F) caused feature decoupling conflicts. Model E achieved a P of 82.1% and an mAP of 80.1%, thereby demonstrating the viability of multi-module collaborative optimisation through its lightweight design. The ToMASD model proposed in this paper was found to achieve the optimal balance in parameters, computational efficiency, and performance. In comparison with the baseline model A, it enhanced the accuracy by 9.4% whilst escalating the computational cost by a mere 0.4 GFLOPs. This outcome serves to demonstrate the efficacy of the collaborative design of multi-dimensional attention mechanisms and lightweight architectures for object detection tasks.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Results of model improvement ablation experiment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">TAAM</th>
<th valign="middle" align="center">Faster-GLUDet</th>
<th valign="middle" align="center">MDH</th>
<th valign="middle" align="center">P%</th>
<th valign="middle" align="center">mAP%</th>
<th valign="middle" align="center">Parameters</th>
<th valign="middle" align="center">FLOPs/G</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">A</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">77.4</td>
<td valign="middle" align="center">72.6</td>
<td valign="middle" align="center">1.75&#xd7;10<sup>7</sup>
</td>
<td valign="middle" align="center">20.9</td>
</tr>
<tr>
<td valign="middle" align="center">B</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">79.9</td>
<td valign="middle" align="center">74.4</td>
<td valign="middle" align="center">2.45&#xd7;10<sup>6</sup>
</td>
<td valign="middle" align="center">6.1</td>
</tr>
<tr>
<td valign="middle" align="center">C</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">80.3</td>
<td valign="middle" align="center">77.9</td>
<td valign="middle" align="center">1.84&#xd7;10<sup>6</sup>
</td>
<td valign="middle" align="center">4.3</td>
</tr>
<tr>
<td valign="middle" align="center">D</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">79.2</td>
<td valign="middle" align="center">72.3</td>
<td valign="middle" align="center">4.46&#xd7;10<sup>6</sup>
</td>
<td valign="middle" align="center">12.0</td>
</tr>
<tr>
<td valign="middle" align="center">E</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">82.1</td>
<td valign="middle" align="center">80.1</td>
<td valign="middle" align="center">2.36&#xd7;10<sup>6</sup>
</td>
<td valign="middle" align="center">6.5</td>
</tr>
<tr>
<td valign="middle" align="center">F</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">78.8</td>
<td valign="middle" align="center">77.1</td>
<td valign="middle" align="center">2.69&#xd7;10<sup>6</sup>
</td>
<td valign="middle" align="center">7.6</td>
</tr>
<tr>
<td valign="middle" align="center">ToMASD</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">84.3</td>
<td valign="middle" align="center">81.7</td>
<td valign="middle" align="center">2.46&#xd7;10<sup>6</sup>
</td>
<td valign="middle" align="center">7.1</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2_4">
<label>3.2.4</label>
<title>Heat map visualization analysis</title>
<p>The present study employed the gradient-weighted class activation mapping technique (<xref ref-type="bibr" rid="B16">Selvaraju et&#xa0;al., 2017</xref>) to visualise the small target detection mechanism of the ToMASD model (<xref ref-type="fig" rid="f14">
<bold>Figure&#xa0;14</bold>
</xref>). The experimental findings demonstrate that the tomato leaf disease detection model, based on transfer learning, exhibits adaptive feature capabilities in different crop disease recognition tasks. When the model is transferred from the source domain of tomatoes to the target domain of common beans, the heatmap analysis indicates that the lesion areas can still be effectively captured, although the extent of the activated (highlighted) regions is smaller than that in the source domain. This suggests that the model has initially acquired the ability to locate disease spots across species through transfer learning. When the model is transferred to potatoes, which have more distinct morphological features, the heatmap shows more concentrated highlighted areas; this may be related to the reticulate vein structure of potato leaves, which enhances the distinguishability of texture features. It is noteworthy that the heatmaps of all three crops demonstrate a substantial contrast between the lesion areas and healthy tissues, thereby confirming that the model, while retaining key pathological features, has achieved adaptive adjustments to different crop leaf diseases through weight transfer. This cross-species disease recognition capability provides a feasible solution for intelligent diagnosis of multiple crop pests and diseases under resource-limited conditions.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Feature visualization of leaf disease patterns across different crops: <bold>(a)</bold> tomato; <bold>(b)</bold> common bean; <bold>(c)</bold> potato.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1644271-g014.tif">
<alt-text content-type="machine-generated">Grad-CAM heat maps overlaid on leaves of tomato, bean, and potato plants analyzed for disease detection. The images show varying color patterns, with highlighted sections indicating regions of stronger model activation, marked by rectangles.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<sec id="s4_1">
<label>4.1</label>
<title>Resource identification initiative</title>
<p>The present study proposes a detection model, ToMASD, which integrates multi-scale feature fusion and dynamic attention mechanisms, with a view to addressing the challenges of tomato leaf disease detection in complex agricultural environments. The field of agricultural disease detection frequently grapples with challenges such as leaf occlusion, a wide spectrum of lesion morphologies, and intricate lighting conditions. Conventional methodologies are predicated on manually designed features, which are challenging to adapt to the multi-scale representation requirements of dynamic field environments. Despite its promising performance, ToMASD has limitations, notably in real-time deployment: although lightweight, the model may still struggle on low-end edge devices. Knowledge distillation or quantization could further compress the model. First, this study addresses the issue of misalignment between diseased and healthy tissues by designing a Two-branch Adaptive Alignment Module (TAAM) with a dynamic weight allocation mechanism. Secondly, the Faster-GLUDet module enhances noise suppression capabilities while maintaining a lightweight model through a local context-aware gating unit. The innovative architecture of the model integrates depthwise convolution and pointwise convolution to construct gating signals, thereby achieving a substantial reduction in background false detection rates, particularly in foggy and strong-light conditions. Thirdly, the multi-scale decoupled detection head (MDH) achieves balanced detection of both small and diffuse lesions. This is achieved through the implementation of group normalisation and the establishment of independent classification and regression branches. 
A series of ablation experiments were conducted, the results of which demonstrate that MDH enhances the mAP value of imbalanced datasets by 7.8%, particularly with regard to the recognition accuracy of low-contrast diseases such as leaf mold.</p>
<p>Despite the fact that ToMASD demonstrates superiority over existing models in a number of metrics, there is still scope for enhancement. For instance, the mAP for Black Spot detection is only 76.1%, indicating that the ability to distinguish similar texture diseases needs further optimization. It is recommended that future research explore the potential of knowledge distillation techniques to further compress the model size. In addition, the combination of spatio-temporal features could enhance the prediction of disease spread dynamics. In conclusion, ToMASD has been demonstrated to provide an efficient and reliable solution for precise disease diagnosis in complex agricultural scenarios through the collaborative design of multi-dimensional attention mechanisms and lightweight architectures. This suggests that there is significant practical value and potential for promotion.</p>
<p>This study further validates the potential of the ToMASD model for transfer learning in cross-crop disease detection. The weights of the tomato disease detection model were transferred to the potato and common bean tasks, the cross-crop features were fused through a domain-adaptive layer, and an adversarial feature decoupling strategy was used to suppress the inter-domain distribution bias. Experiments show that the transferred model achieves accuracy values of 93.5% and 92.1% on the potato and common bean test sets, respectively. The technical innovation is that, by decoupling the cross-crop shared features from the crop-specific features, the model overcomes the problem of confusing the small brown spots of bean rust with the background leaf vein texture while retaining the accuracy of tomato disease detection, supporting the joint monitoring of multi-crop diseases. The results show that the lightweight architecture and domain-adaptive mechanism of ToMASD provide a cross-species generalization paradigm for building a general-purpose agricultural disease monitoring system, which significantly reduces the model development cost for multi-crop disease detection.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusions</title>
<p>This study proposed ToMASD, a novel lightweight detection model, to address the critical challenges of tomato disease detection in complex agricultural environments. The core contributions of this work are threefold, each validated by extensive experimental results:</p>
<p>Firstly, to mitigate the feature attenuation of small lesions and the misalignment interference between diseased and healthy tissues, we designed the Two-branch Adaptive Alignment Module (TAAM). This module dynamically aligns cross-scale features, which was a key factor in achieving the overall mAP of 81.7%, a significant improvement over all baseline models.</p>
<p>Secondly, to enhance feature representation while suppressing complex background noise, we integrated the Faster-GLUDet feature enhancement unit. Its local context-aware gating mechanism effectively reduced false positives, as evidenced by the low misdetection rates of 6.3% in foggy and 9.8% in strong light conditions, while maintaining a low computational cost of only 7.1 GFLOPs.</p>
<p>Thirdly, to balance the detection accuracy between tiny spots and large lesions, we developed the Multi-scale Decoupling Head (MDH). By employing Group Normalization and independent task-specific branches, the MDH ensured balanced detection, which is reflected in the stable performance across all six disease categories, even under severe class imbalance.</p>
<p>Future research can be extended in three aspects: first, integrating hyperspectral imaging technology to enhance the characterisation of chromaticity gradient diseases; second, verifying the generalization of the model in economic crops such as chili peppers and grapes, based on the current transfer learning framework; third, exploring the deployment scheme of edge computing to build a low-power field monitoring network in combination with LoRaWAN wireless transmission, to realize spatio-temporal prediction of disease spreading and precise prevention and control. The study is based on a lightweight architecture and a low-power edge computing deployment scheme. Through the deep integration of lightweight architecture and transfer learning technology, this study provides a scalable solution for intelligent diagnosis of agricultural diseases, promotes the digital transformation of disease monitoring from single-crop scenarios to multi-species and multi-environment collaborative management, and assists the sustainable development of agriculture.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://gitcode.com/open-source-toolkit/829ec">https://gitcode.com/open-source-toolkit/829ec</uri>.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>JH: Investigation, Writing &#x2013; original draft, Formal analysis, Software, Conceptualization, Data curation, Methodology, Writing &#x2013; review &amp; editing. JL: Conceptualization, Resources, Funding acquisition, Writing &#x2013; review &amp; editing, Writing &#x2013; original draft, Visualization, Validation, Formal analysis. HW: Supervision, Data curation, Writing &#x2013; review &amp; editing, Validation, Writing &#x2013; original draft, Resources.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. This research was funded by the Natural Science Foundation of Jilin Province (20240101209JC).</p>
</sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Astani</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Hasheminejad</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Vaghefi</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A diverse ensemble classifier for tomato disease recognition</article-title>. <source>Comput. Electron Agric.</source> <volume>198</volume>, <elocation-id>107054</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107054</pub-id>
</citation></ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barbedo</surname> <given-names>J. G. A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Impact of dataset size and variety on the effectiveness of deep learning and transfer learning for plant disease classification</article-title>. <source>Comput. Electron Agric.</source> <volume>153</volume>, <fpage>46</fpage>&#x2013;<lpage>53</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.08.013</pub-id>
</citation></ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bhagat</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Haque</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Munda</surname> <given-names>H. S.</given-names>
</name>
<name>
<surname>Bhagat</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Plant leaf disease classification using grid search based SVM</article-title>,&#x201d; in <conf-name>2nd International Conference on Data, Engineering and Applications (IDEA)</conf-name>. <fpage>1</fpage>&#x2013;<lpage>6</lpage> (<publisher-loc>Bhopal, India</publisher-loc>: <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IDEA49133.2020.9170725</pub-id>
</citation></ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhuo</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>C.-H.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Run, don&#x2019;t walk: chasing higher FLOPS for faster neural networks</article-title>. <source>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>, <publisher-loc>Vancouver, BC, Canada</publisher-loc>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01157</pub-id>
</citation></ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cong</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Research on instance segmentation algorithm of greenhouse sweet pepper detection based on improved mask RCNN</article-title>. <source>Agronomy</source> <volume>13</volume>, <elocation-id>196</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13010196</pub-id>
</citation></ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Debnath</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Md.</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Raihan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Samrat</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Alsulami</surname> <given-names>M. M.</given-names>
</name>
<name>
<surname>Masud</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>A smartphone-based detection system for tomato leaf disease using efficientNetV2B2 and its explainability with artificial intelligence (AI)</article-title>. <source>Sensors</source> <volume>23</volume>, <elocation-id>8685</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23218685</pub-id>, PMID: <pub-id pub-id-type="pmid">37960385</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Javidan</surname> <given-names>S. M.</given-names>
</name>
<name>
<surname>Banakar</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Vakilian</surname> <given-names>K. A.</given-names>
</name>
<name>
<surname>Ampatzidis</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Diagnosis of grape leaf diseases using automatic K-means clustering and machine learning</article-title>. <source>Smart Agric. Technol.</source> <volume>3</volume>, <elocation-id>100081</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.atech.2022.100081</pub-id>
</citation></ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Weng</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Geng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>YOLOv6: A single-stage object detection framework for industrial applications</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2209.02976</pub-id>
</citation></ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Anguelov</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Erhan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Szegedy</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Reed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>C.-Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). <article-title>SSD: single shot multiBox detector</article-title>. <fpage>21</fpage>&#x2013;<lpage>37</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-46448-0_2</pub-id>
</citation></ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Using hyperspectral imaging to discriminate yellow leaf curl disease in tomato leaves</article-title>. <source>Precis Agric.</source> <volume>19</volume>, <fpage>379</fpage>&#x2013;<lpage>394</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-017-9524-7</pub-id>
</citation></ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Panno</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Davino</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Caruso</surname> <given-names>A. G.</given-names>
</name>
<name>
<surname>Bertacca</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Crnogorac</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Mandi&#x107;</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A review of the most common and economically important diseases that undermine the cultivation of tomato crop in the mediterranean basin</article-title>. <source>Agronomy</source> <volume>11</volume>, <elocation-id>2188</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy11112188</pub-id>
</citation></ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>R.</surname> <given-names>K.</given-names>
</name>
<name>
<surname>M.</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Anand</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Mathikshara</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Johnson</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Menaka</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Attention embedded residual CNN for disease detection in tomato leaves</article-title>. <source>Appl. Soft Comput.</source> <volume>86</volume>, <elocation-id>105933</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.asoc.2019.105933</pub-id>
</citation></ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>YOLOv3: an incremental improvement</article-title>.</citation></ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodr&#xed;guez</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lizarazo</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Prieto</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Angulo-Morales</surname> <given-names>V.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Assessment of potato late blight from UAV-based multispectral imagery</article-title>. <source>Comput. Electron Agric.</source> <volume>184</volume>, <elocation-id>106061</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106061</pub-id>
</citation></ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saleem</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sharif</surname> <given-names>M. I.</given-names>
</name>
<name>
<surname>Sharif</surname> <given-names>M. I.</given-names>
</name>
<name>
<surname>Sajid</surname> <given-names>M. Z.</given-names>
</name>
<name>
<surname>Marinello</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Comparison of deep learning models for multi-crop leaf disease detection with enhanced vegetative feature isolation and definition of a new hybrid architecture</article-title>. <source>Agronomy</source> <volume>14</volume>, <elocation-id>2230</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy14102230</pub-id>
</citation></ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Selvaraju</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>Cogswell</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Das</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Vedantam</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Parikh</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Batra</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Computer Vision (ICCV)</conf-name>. <fpage>618</fpage>&#x2013;<lpage>626</lpage> (<publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.74</pub-id>
</citation></ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>TransNeXt: robust foveal visual perception for vision transformers</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01683</pub-id>
</citation></ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>Y. K.</given-names>
</name>
<name>
<surname>Heung</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Nguyen-Quang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Price</surname> <given-names>G. W.</given-names>
</name>
<name>
<surname>Al-Mallahi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A deep learning approach for RGB image-based powdery mildew disease detection on strawberry leaves</article-title>. <source>Comput. Electron. Agric.</source> <volume>183</volume>, <elocation-id>106042</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106042</pub-id>
</citation></ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>YOLOv7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00721</pub-id>
</citation></ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>a). <article-title>YOLOv10: real-time end-to-end object detection</article-title>. <source>ArXiv</source>, <elocation-id>abs/2405.14458</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2405.14458</pub-id>
</citation></ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I.-H.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2024</year>b). <article-title>YOLOv9: learning what you want to learn using programmable gradient information</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-031-72751-1_1</pub-id></citation></ref>
</ref-list>
</back>
</article>