<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1639269</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>CNATNet: a convolution-attention hybrid network for safflower classification</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Ma</surname>
<given-names>Pengwei</given-names>
</name>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3169324/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Lian</surname>
<given-names>Nan</given-names>
</name>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3066824/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Dong</surname>
<given-names>Leilei</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Luo</surname>
<given-names>Yunchen</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Zheng</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhu</surname>
<given-names>Yuanjiao</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Zefang</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhou</surname>
<given-names>Jie</given-names>
</name>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>College of Information Science and Technology, Shihezi University</institution>, <addr-line>Shihezi</addr-line>,&#xa0;<country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1886836/overview">Zhenghong Yu</ext-link>, Guangdong Polytechnic of Science and Technology, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2019445/overview">Jing Yao</ext-link>, Chinese Academy of Sciences (CAS), China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2949049/overview">Tianyu Liu</ext-link>, Hunan Agricultural University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Jie Zhou, <email xlink:href="mailto:jiezhou@shzu.edu.cn">jiezhou@shzu.edu.cn</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1639269</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Ma, Lian, Dong, Luo, Sun, Zhu, Chen and Zhou.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Ma, Lian, Dong, Luo, Sun, Zhu, Chen and Zhou</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Safflower (<italic>Carthamus tinctorius</italic> L.) is an important medicinal and economic crop, where efficient and accurate filament grading is essential for quality control in agricultural and pharmaceutical applications. However, current methods rely on manual inspection, which is time-consuming and difficult to scale. A coarse-to-fine grading framework is established, consisting of cluster-level classification for rapid assessment and filament-level fine-grained classification. To implement this framework, a lightweight hybrid network, CNATNet, is designed by integrating convolutional operations and attention mechanisms. The classical C2f feature extraction module is optimized into two components: C2S2, a lightweight convolutional variant with cascaded split connections, and AnC2f, an n-order local attention mechanism. A depthwise separable convolution-based head (DWClassify) is further employed to accelerate inference while maintaining accuracy. Experiments on a high-resolution safflower filament dataset indicate that CNATNet achieves 98.6% accuracy at the cluster level and 95.6% at the filament level, with an average latency of 1.9 ms per image. Compared with representative baselines such as YOLOv11m and RT-DETRv2s, CNATNet consistently yields higher accuracy with reduced latency. Moreover, deployment on the Jetson Orin Nano demonstrates real-time performance at 63 FPS under 15 W, confirming its feasibility for embedded agricultural grading in resource-constrained environments. These results suggest that CNATNet provides a task-specific lightweight solution balancing accuracy and efficiency, with strong potential for practical safflower quality classification.</p>
</abstract>
<kwd-group>
<kwd>safflower classification</kwd>
<kwd>deep learning</kwd>
<kwd>CNN-attention hybrid</kwd>
<kwd>C2S2</kwd>
<kwd>AnC2f</kwd>
<kwd>DWClassify</kwd>
</kwd-group>
<counts>
<fig-count count="12"/>
<table-count count="6"/>
<equation-count count="10"/>
<ref-count count="42"/>
<page-count count="17"/>
<word-count count="6964"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Efficient classification of safflower filaments is a key challenge in modern agricultural quality control, given their significant economic and medicinal value. Safflower (<italic>Carthamus tinctorius</italic> L.) is widely recognized for its pharmacological effects, including promoting blood circulation, anti-inflammation, and antioxidation (<xref ref-type="bibr" rid="B21">Pu et&#xa0;al., 2019</xref>). Benefiting from favorable climatic conditions, Xinjiang Province accounts for more than 75% of China&#x2019;s safflower production, producing filaments with high active compound content, bright color, and intact structure (<xref ref-type="bibr" rid="B15">Lin et&#xa0;al., 2020</xref>). In practical production scenarios, filament color, texture, and integrity are critical indicators for assessing safflower quality. However, large-scale filament grading still relies on manual visual inspection, which suffers from high labor intensity, subjectivity, and limited scalability, failing to meet the demands of standardized and efficient production.</p>
<p>To address these limitations, researchers have explored various analytical techniques for safflower-specific applications. For instance, hyperspectral imaging combined with machine learning models has been successfully applied to monitor drought stress in safflower, enabling precise classification of plant health states (<xref ref-type="bibr" rid="B24">Salek et&#xa0;al., 2024</xref>). Similarly, computer-aided decision-making systems based on spectral reflectance data have been developed to optimize irrigation strategies and assess safflower quality, demonstrating the feasibility of intelligent agricultural management (<xref ref-type="bibr" rid="B10">Karada&#x11f;, 2022</xref>). In terms of product safety and quality control, machine learning-assisted surface-enhanced Raman spectroscopy (SERS) sensors have been employed for rapid detection of illegal dye additives in safflower products, facilitating highly sensitive and on-site hazardous substance analysis (<xref ref-type="bibr" rid="B14">Lin et&#xa0;al., 2024</xref>). Furthermore, a practical &#x201c;indistinct&#x201d; evaluation method, integrating bioactivity assays with visual character analysis, has been proposed to establish efficient and low-cost quality grading standards for safflower (<xref ref-type="bibr" rid="B41">Zhou et&#xa0;al., 2023</xref>). While these safflower-related studies have achieved notable progress, complementary research on saffron (<italic>Crocus sativus</italic> L.) provides valuable technical references. Methods such as E-nose combined with gas chromatography-mass spectrometry (GC-MS) (<xref ref-type="bibr" rid="B26">Sun et&#xa0;al., 2022</xref>) and UHPLC-HRMS/MS-based metabolomics (<xref ref-type="bibr" rid="B23">Ryparova Kvirencova et&#xa0;al., 2023</xref>) have shown effectiveness in detecting adulteration and ensuring product authenticity. 
However, these approaches often rely on sophisticated instrumentation and complex workflows, which limit their applicability for real-time, large-scale safflower classification tasks.</p>
<p>In recent years, vision-based deep learning has shown strong potential in automated plant phenotype analysis and quality evaluation. Deep neural networks have been successfully applied to tasks such as safflower germplasm classification, demonstrating high accuracy under field conditions (<xref ref-type="bibr" rid="B27">Van et&#xa0;al., 2025</xref>). For fine-grained tasks like filament-level analysis, CNN models designed for unstructured environments have also shown promising results (<xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2024a</xref>). However, the visual complexity of safflower filaments&#x2014;including subtle textures, shape variability, and frequent overlaps&#x2014;poses challenges for real-time and precise classification. These factors often lead to a trade-off between accuracy and inference speed, limiting the practical deployment of current models in large-scale agricultural systems. To improve feature extraction and reduce model complexity, several studies have proposed lightweight modifications to core network modules. For example, <xref ref-type="bibr" rid="B34">Wang and Liu (2024)</xref> introduced a GhostConv-based variant of the C2f module, which significantly reduced parameter counts while maintaining detection accuracy. In another study, <xref ref-type="bibr" rid="B33">Wang et&#xa0;al. (2025)</xref> proposed a pyramid-style C2f structure to enhance multi-scale feature learning and improve computational efficiency. Meanwhile, efforts have also focused on integrating attention mechanisms into feature fusion designs. <xref ref-type="bibr" rid="B19">Miao et&#xa0;al. (2025)</xref> developed a dynamic convolution and spatial attention architecture to better capture both local and global semantics. In the field of crop quality grading, <xref ref-type="bibr" rid="B40">Zhao et&#xa0;al. (2024)</xref> presented an attention-enhanced classification framework that substantially improved prediction accuracy in complex agricultural scenarios. 
In addition, lightweight detection heads have been proposed to reduce computation costs during inference, particularly for use in embedded or resource-constrained agricultural environments (<xref ref-type="bibr" rid="B22">Qing et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B33">Wang et&#xa0;al., 2025</xref>). Despite these advancements, existing approaches remain focused on general object detection and often fail to address the fine-grained structural characteristics of safflower filaments. Attention modules tend to emphasize global context while overlooking local morphological cues such as edge continuity and curvature. Furthermore, decoupled classification heads may introduce information loss, reducing model reliability in detail-sensitive tasks. These limitations underscore the need for a model that is both lightweight and capable of fine-structure modeling, specifically tailored for real-time filament-level classification in agricultural applications.</p>
<p>Beyond CNN- and attention-based lightweight designs, recent alternative paradigms have emerged. For hyperspectral scenarios, SpectralMamba (<xref ref-type="bibr" rid="B37">Yao et&#xa0;al., 2024</xref>) adopts a state-space-model backbone that combines gated spatial&#x2013;spectral interaction with efficient sequential modeling to balance accuracy and efficiency. Complementarily, SPECIAL (<xref ref-type="bibr" rid="B20">Pang et&#xa0;al., 2025</xref>) presents a CLIP-based zero-shot pipeline that interpolates HSI into RGB bands to obtain pseudo-labels and then refines them via noisy-label learning. While our study targets RGB-based safflower filament grading under a supervised setting, these principles&#x2014;efficient state&#x2013;space feature interaction and label-efficient supervision&#x2014;outline promising directions for future lightweight classification systems in spectral or multi-modal agricultural applications.</p>
<p>To address these challenges, this study proposes CNATNet, a lightweight convolution-attention hybrid model designed for efficient and accurate safflower filament classification. CNATNet integrates multi-branch feature extraction, attention-enhanced feature fusion, and lightweight prediction modules to ensure high classification accuracy while meeting practical requirements for low latency and limited computational resources. Specifically, CNATNet is tailored to the morphological characteristics of safflower filaments, adopting a coarse-to-fine recognition strategy that progressively refines feature representations from cluster-level patterns to filament-level details. The model architecture builds upon established lightweight designs, including MobileNetV2 (<xref ref-type="bibr" rid="B25">Sandler et&#xa0;al., 2018</xref>), MobileNetV3 (<xref ref-type="bibr" rid="B8">Howard et&#xa0;al., 2019</xref>), and GhostViT (<xref ref-type="bibr" rid="B1">Cao et&#xa0;al., 2024</xref>), achieving an effective balance between recognition performance and computational efficiency. On this basis, CNATNet introduces the following key contributions:</p>
<list list-type="order">
<list-item>
<p>C2S2 convolution module: To enhance feature extraction efficiency, a novel C2S2 module is designed by partitioning feature channels into multiple lightweight parallel branches, enabling efficient convolution operations. The cascaded connection structure further strengthens the module&#x2019;s capacity to capture filament directionality, preserve curvature continuity, and maintain edge integrity. This lightweight design allows the network to effectively represent fine-grained morphological features critical for accurate filament classification under complex visual conditions.</p>
</list-item>
<list-item>
<p>AnC2f attention module: To better cope with the fine-grained texture and overlapping structures of safflower, an attention mechanism is incorporated. AnC2f enhances the model&#x2019;s sensitivity to key spatial regions and multi-scale cues by gradually stacking lightweight attention modules in the residual fusion path. This design improves the network&#x2019;s ability to capture subtle structural changes, enabling more accurate distinction between premium-grade and regular-grade filaments.</p>
</list-item>
<list-item>
<p>DWClassify lightweight classification head: By decoupling spatial and channel features through depthwise separable convolution, DWClassify substantially reduces computational complexity without sacrificing prediction accuracy. This ensures that the model can achieve real-time inference not only on high-performance GPUs but also on resource-constrained embedded platforms, reinforcing its lightweight nature.</p>
</list-item>
<list-item>
<p>Deployment on Jetson Orin Nano: The optimized CNATNet model was deployed on the Jetson Orin Nano, a high-performance yet power-efficient embedded platform. Real-time inference was achieved for safflower filament and cluster classification at 63 FPS under 15 W, maintaining high accuracy under constrained computational budgets. These results confirm that CNATNet is not only accurate but also lightweight and deployment-ready, providing a practical and scalable solution for intelligent plant quality assessment in modern agriculture.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data collection</title>
<p>Xinjiang is the largest safflower-producing region in China, with its favorable climate and soil conditions contributing to filaments that are superior in color, texture, and bioactive compound content compared to those from other regions. The experimental sites for this study were located in the main safflower production areas of Changji Hui Autonomous Prefecture, including Changji City, Manas County, and Hutubi County. The collected samples from these areas were characterized by vivid coloration, fine and intact filament structure, and high medicinal and economic value.</p>
<p>Traditionally, safflower classification relies on quantifying active constituents such as safflower yellow pigment. However, these methods are time-consuming and involve complex chemical operations, making them unsuitable for real-time classification or automated sorting systems. To meet the demands of automation, this study adopts a visual classification standard based on the appearance of safflower filaments. By combining field observations and expert feedback from local growers, the filaments were categorized into two major quality classes: premium-grade filaments suitable for medicinal applications, and regular-grade filaments intended for auxiliary uses. Premium-grade filaments are defined by their bright color, compact structure, and high fiber integrity, making them ideal for clinical or health-related purposes. In contrast, regular-grade filaments exhibit dimmer coloration, inconsistent thickness, and lower fiber quality, and are better suited for daily health care, soaking, or pigment extraction. This classification standard is practical for both manual annotation and automated model training and effectively captures intrinsic quality differences. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> presents representative examples of premium-grade and regular-grade single safflower filaments, illustrating their visual distinctions in color, texture, and structural integrity.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Visual comparison of safflower filaments across different quality grades. <bold>(A)</bold> High-quality <italic>Carthamus tinctorius</italic> L. filament. <bold>(B)</bold> Common-quality <italic>Carthamus tinctorius</italic> L. filament.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g001.tif">
<alt-text content-type="machine-generated">Two panels labeled A and B, each showing five small, elongated red-orange plant structures on a light background. Panel A features more uniformly shaped forms, while panel B displays slightly more varied shapes with yellow and orange hues.</alt-text>
</graphic>
</fig>
<p>From June to August 2024, safflower filament images were acquired across multiple plantation sites in Changji Prefecture using a mobile imaging setup equipped with an iPhone 14 Pro Max. All samples were selected from post-harvest and naturally air-dried filaments to ensure consistency with real-world processing conditions. To enhance dataset diversity and improve the robustness of model generalization, images were acquired under diverse environmental conditions. Specifically, the dataset was constructed by capturing images across various camera angles, illumination levels, and background complexities. The acquisition scenarios included:</p>
<p>(i) outdoor environments under natural daylight; (ii) indoor settings with diffuse artificial lighting; (iii) backgrounds exhibiting different levels of clutter and occlusion.</p>
<p>A total of 5,800 images were acquired at a resolution of 3840&#xd7;2160 pixels and saved in PNG format to ensure lossless preservation. The dataset comprises both scattered single filaments and densely stacked floral clusters. Image acquisition was performed at various distances, ranging from close-up views (&lt;0.15 m) to medium-to-long ranges (&gt;0.15 m), thereby enhancing the robustness and generalization capability of the proposed classification model.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Dataset creation and annotation</title>
<p>To ensure annotation accuracy prior to model training, all original safflower filament images were labeled using the Roboflow platform. Each image was annotated with a single instance representing either a premium-grade or regular-grade filament. The two categories were respectively labeled as <italic>&#x201c;CT Premium&#x201d;</italic> and <italic>&#x201c;CT Normal&#x201d;</italic>. Each image contains only one target object, either an isolated filament or a dense filament cluster, assigned with the appropriate class label. Considering the influence of environmental variation&#x2014;such as fluctuating lighting and complex backgrounds&#x2014;this study adopts a binary classification strategy based on single-instance images. The model directly learns quality classification from global visual features of individual filaments. This strategy enables the model to focus on overall filament appearance, thereby improving classification accuracy and robustness under challenging real-world conditions. To visually illustrate the environmental variations considered in the proposed classification strategy, representative filament samples captured under different lighting conditions are shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>. For each quality grade, the left panel presents filaments photographed under natural daylight, while the right panel corresponds to images captured with diffuse artificial lighting. Additionally, the lower row displays individual filaments extracted from the bulk samples, serving as single-instance inputs for classification.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Visual comparison of safflower filaments under different quality grades and lighting conditions. <bold>(A)</bold> High-quality sample. <bold>(B)</bold> Common-quality sample.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g002.tif">
<alt-text content-type="machine-generated">Panel A shows two piles of safflower filaments with variations in color and density. Below are five extracted safflower filaments displayed in a row. Panel B displays similar piles of safflower filaments and a row of five extracted safflower filaments with subtle differences from Panel A.</alt-text>
</graphic>
</fig>
<p>The safflower filament dataset used in this study was constructed through systematic image acquisition under diverse environmental conditions. To ensure a comprehensive representation of real-world scenarios, images were captured across multiple variables, including filament quantity (single or multiple filaments), shooting distance (close-up or distant view), illumination condition (natural or supplementary lighting), camera angle, and background complexity. Owing to natural variations in field cultivation and post-harvest processing, the collected dataset exhibits differences in filament quality, structure, and visual appearance. A total of 5,800 high-resolution images (3840&#xd7;2160 pixels) were acquired, covering both premium-grade and regular-grade safflower filaments. The detailed class distribution is summarized in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Class distribution of the safflower dataset under various acquisition conditions.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Scenario</th>
<th valign="middle" align="center">Category</th>
<th valign="middle" align="center">Premium</th>
<th valign="middle" align="center">Normal</th>
<th valign="middle" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="2" align="center">Filament quantity</td>
<td valign="middle" align="center">Single filaments</td>
<td valign="middle" align="center">1650</td>
<td valign="middle" align="center">1350</td>
<td valign="middle" align="center">3000</td>
</tr>
<tr>
<td valign="middle" align="center">Multiple filaments</td>
<td valign="middle" align="center">1450</td>
<td valign="middle" align="center">1350</td>
<td valign="middle" align="center">2800</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="center">Shooting distance</td>
<td valign="middle" align="center">Close-up view</td>
<td valign="middle" align="center">1750</td>
<td valign="middle" align="center">1550</td>
<td valign="middle" align="center">3300</td>
</tr>
<tr>
<td valign="middle" align="center">Distant view</td>
<td valign="middle" align="center">1350</td>
<td valign="middle" align="center">1150</td>
<td valign="middle" align="center">2500</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="center">Lighting condition</td>
<td valign="middle" align="center">Natural lighting</td>
<td valign="middle" align="center">1900</td>
<td valign="middle" align="center">1600</td>
<td valign="middle" align="center">3500</td>
</tr>
<tr>
<td valign="middle" align="center">Supplementary lighting</td>
<td valign="middle" align="center">1200</td>
<td valign="middle" align="center">1100</td>
<td valign="middle" align="center">2300</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="center">Camera angle</td>
<td valign="middle" align="center">Frontal view</td>
<td valign="middle" align="center">1600</td>
<td valign="middle" align="center">1400</td>
<td valign="middle" align="center">3000</td>
</tr>
<tr>
<td valign="middle" align="center">Multi-angle view</td>
<td valign="middle" align="center">1500</td>
<td valign="middle" align="center">1300</td>
<td valign="middle" align="center">2800</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Background complexity</td>
<td valign="middle" align="center">Clean background</td>
<td valign="middle" align="center">1800</td>
<td valign="middle" align="center">1500</td>
<td valign="middle" align="center">3300</td>
</tr>
<tr>
<td valign="middle" align="center">Cluttered background</td>
<td valign="middle" align="center">900</td>
<td valign="middle" align="center">800</td>
<td valign="middle" align="center">1700</td>
</tr>
<tr>
<td valign="middle" align="center">Occluded background</td>
<td valign="middle" align="center">800</td>
<td valign="middle" align="center">700</td>
<td valign="middle" align="center">1500</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To support downstream model training and evaluation, the full dataset was randomly partitioned into training, validation, and testing subsets at a ratio of 7:2:1. The class distribution was maintained consistently across all subsets to ensure balanced data quality and enable fair performance comparisons. The collected images cover diverse acquisition conditions, including variations in shooting angles, shooting distances, and environmental complexities, to enhance model robustness and generalization, as illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. To mitigate overfitting and reduce sensitivity to class distribution bias, multiple data augmentation techniques were applied during training. Specifically, random rotation, horizontal flipping, brightness adjustment, and affine transformations were employed to simulate variations in angle, lighting, and background complexity. These augmentations not only enrich the diversity of training samples but also enhance the model&#x2019;s generalization capability under challenging conditions. Representative examples of these augmentation strategies are illustrated in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Visual examples of safflower captured under different acquisition conditions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g003.tif">
<alt-text content-type="machine-generated">Array of ten images showing a pile of safflower from different angles, distances, and environments. &#x201c;Different Angles&#x201d; displays five varied perspectives. &#x201c;Different Distances&#x201d; features close-up to far views. &#x201c;Different Environments&#x201d; showcases lighting variations. The safflower is vibrant orange-red with yellow strands.</alt-text>
</graphic>
</fig>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Representative examples of augmented safflower filament images. <bold>(A)</bold> Original image, <bold>(B)</bold> Random rotation, <bold>(C)</bold> Horizontal flipping, <bold>(D)</bold> Brightness adjustment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g004.tif">
<alt-text content-type="machine-generated">Four images of safflower filaments illustrating different augmentation techniques: (A) Original image, showing red strands on a black background. (B) Random rotation, displaying the same strands slightly tilted. (C) Horizontal flipping, with the image mirrored. (D) Brightness adjustment, where the image appears lighter.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Model architecture</title>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>CNATNet framework overview</title>
<p>CNATNet is a hybrid network that integrates convolutional neural networks with attention mechanisms. It consists of two main components: a backbone for multi-scale feature extraction and fusion, and a classification head for safflower filament quality prediction. As illustrated in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, the architecture integrates both convolutional and attention-based modules across feature extraction and classification stages. Specifically, the backbone incorporates convolution-centric C2S2 modules and attention-enhanced AnC2f modules, forming a synergistic structure that leverages the strengths of both paradigms. The C2S2 module focuses on efficient representation learning with reduced computational cost (<xref ref-type="bibr" rid="B3">Chen et&#xa0;al., 2024b</xref>; <xref ref-type="bibr" rid="B16">Liu et&#xa0;al., 2024</xref>), while the AnC2f module enhances spatial attention and multi-scale perception through stacked attention blocks (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B17">Lv et&#xa0;al., 2022</xref>). For final prediction, CNATNet employs a lightweight DWClassify head based on depthwise separable convolutions to minimize parameter count and optimize inference speed (<xref ref-type="bibr" rid="B5">Dwika Hefni Al-Fahsi et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B6">Gao et&#xa0;al., 2021</xref>).</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>CNATNet structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g005.tif">
<alt-text content-type="machine-generated">Flowchart of a neural network model for image classification. The input is an image of safflower filaments, processed through a series of convolutional layers labeled &#x201c;C2S2&#x201d; and &#x201c;AnC2f&#x201d; in the &#x201c;Backbone CNN&#x201d; and &#x201c;Backbone Attention&#x201d; sections. The final processing stage, labeled &#x201c;DWClassify&#x201d; in the &#x201c;Head&#x201d; section, outputs the same image labeled &#x201c;PREMIUM&#x201d;.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>C2S2: cascaded split-and-concatenate structure</title>
<p>C2S2 is a lightweight convolutional feature extractor designed for early-stage processing within CNATNet. It starts with a standard convolution to extract base-level features, followed by a channel-wise split into two parallel branches. Each branch processes features through stacked GhostBottleneck or Bottleneck layers to enhance representational capacity while reducing parameter overhead (<xref ref-type="bibr" rid="B9">Huang and Wang, 2025</xref>; <xref ref-type="bibr" rid="B39">Yu and Zhou, 2023</xref>). The internal architecture of C2S2 is illustrated in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, which details the feature flow, dual-branch operations, and lightweight convolutional structure.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Internal architecture of the C2S2 module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g006.tif">
<alt-text content-type="machine-generated">Flowchart of a neural network submodule. It begins with a Conv layer, followed by two Split layers. Parallel branches contain GBN and BN layers. A Concat layer merges outputs, leading to another Conv layer.</alt-text>
</graphic>
</fig>
<p>After local refinement in each branch, a bottleneck layer compresses channel dimensions to extract key features, which are then concatenated across branches to form the unified output. Mathematically, the feature fusion process of C2S2 can be expressed as (<xref ref-type="disp-formula" rid="eq1">Equation 1</xref>):</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle>
<mml:mtext>Concat</mml:mtext>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>X</italic>
<sub>1</sub> and <italic>X</italic>
<sub>2</sub> represent the channel-wise partitions of the input feature map <italic>X</italic>, which are independently processed by the branch-specific transformation functions <italic>f</italic>
<sub>1</sub>(&#xb7;) and <italic>f</italic>
<sub>2</sub>(&#xb7;), respectively. The outputs of these branches are concatenated along the channel dimension by the Concat(&#xb7;) operation, resulting in the unified feature representation <italic>F</italic><sub><italic>C</italic>2<italic>S</italic>2</sub>.</p>
<p>The parameter complexity of C2S2 is quantified in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:munderover>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>C<sub>split</sub>
</italic> denotes the number of channels assigned to each branch after splitting, <italic>K</italic> represents the convolution kernel size, <italic>C<sub>merge</sub>
</italic> refers to the number of channels in the fusion (bottleneck) layer, and <italic>C<sub>out</sub>
</italic> is the output channel number of the C2S2 module. The first summation term calculates the cumulative parameter count of the dual-branch convolutions, each operating within its respective channel partition, while the final term accounts for the parameters introduced by the merging operation that recombines the branch-specific features.</p>
<p>By leveraging its parallel structure, C2S2 achieves a balanced trade-off among computational efficiency, feature diversity, and structural consistency, making it particularly well-suited for processing safflower filament images with directional textures and densely packed patterns (<xref ref-type="bibr" rid="B11">Lau et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B42">Zhu et&#xa0;al., 2024</xref>). This design ensures robust feature extraction while maintaining low computational overhead, which is critical for practical deployment in resource-constrained environments.</p>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>AnC2f: attention-enhanced cross-stage fusion</title>
<p>AnC2f is an attention-driven fusion module inspired by the C2f architecture from YOLOv8 and the original CSPNet structure (<xref ref-type="bibr" rid="B28">Varghese and M, 2024</xref>; <xref ref-type="bibr" rid="B32">Wang et&#xa0;al., 2019</xref>). It enhances cross-stage learning by injecting multiple stacked Attention Blocks (ABlocks) into the fusion process. The structural details of the AnC2f module are depicted in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>. As shown, AnC2f retains the dual-path feature flow of C2f, where the input features are split into two branches: a shortcut branch for direct feature propagation, and a main branch for progressive feature refinement.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Structural comparison among CSPNet <bold>(A)</bold>, C2f <bold>(B)</bold>, and the proposed AnC2f <bold>(C)</bold>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g007.tif">
<alt-text content-type="machine-generated">Diagrams compare three network architectures: CSPNet, C2f, and AnC2f. CSPNet uses a split and transition with `conv` and `block` layers before concatenation. C2f involves multiple `conv` layers post-split before concatenating. AnC2f includes `A2` layers followed by scaling and concatenation. Each structure features transition and split mechanisms.</alt-text>
</graphic>
</fig>
<p>In the main processing branch, stacked Attention Blocks (ABlocks) are applied to modulate the input features through spatially adaptive weighting. This modulation process is formulated as (<xref ref-type="disp-formula" rid="eq3">Equation 3</xref>):</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mstyle>
<mml:mtext>Conv</mml:mtext>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>F<sub>in</sub>
</italic> denotes the input feature map, Conv(&#xb7;) represents a 1 &#xd7; 1 convolutional layer that generates attention weights, <italic>&#x3c3;</italic>(&#xb7;) is the Sigmoid activation function ensuring the attention weights are bounded between 0 and 1, and &#x2299; denotes element-wise multiplication, enabling spatial modulation of <italic>F<sub>in</sub>
</italic>.</p>
<p>To preserve the original information flow, AnC2f incorporates a shortcut branch that directly propagates the input features without attention modulation. The outputs of the main and shortcut branches are then fused through element-wise addition, as defined in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>f</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>F<sub>shortcut</sub>
</italic> refers to the identity feature path, facilitating information retention and gradient propagation, while <italic>F</italic><sub><italic>AnC</italic>2<italic>f</italic></sub> represents the attention-refined features from the main branch. This residual fusion mechanism ensures that the enhanced attention features are complemented by the original unaltered information, leading to richer and more robust representations.</p>
<p>By seamlessly integrating attention-driven enhancement with lightweight computational design, AnC2f effectively improves multiscale feature learning, contributing to the overall performance of CNATNet while maintaining high efficiency.</p>
<p>AnC2f divides input features into parallel paths, applying convolutional and attention operations independently before recombining them via residual connections. Attention blocks are stacked in parallel to progressively improve the model&#x2019;s sensitivity to important spatial regions and morphological patterns (<xref ref-type="bibr" rid="B4">Ding et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B38">Yin et&#xa0;al., 2024</xref>). This enables the network to more effectively capture fine-grained structural details characteristic of safflower filaments.</p>
</sec>
<sec id="s2_3_4">
<label>2.3.4</label>
<title>DWClassify: lightweight classification head</title>
<p>DWClassify functions as the final classification head in CNATNet, aiming to deliver accurate predictions with minimal computational overhead. To this end, DWClassify adopts a depthwise separable convolutional structure, which effectively decouples spatial and channel-wise feature extraction. This design choice significantly reduces both the parameter count and computational complexity compared to conventional convolutional layers (<xref ref-type="bibr" rid="B5">Dwika Hefni Al-Fahsi et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B29">Wang et&#xa0;al., 2023a</xref>).</p>
<p>The parameter complexity of DWClassify is quantified as (<xref ref-type="disp-formula" rid="eq5">Equation 5</xref>):</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>C<sub>in</sub>
</italic> and <italic>C<sub>out</sub>
</italic> denote the input and output channel dimensions, respectively, and <italic>K</italic> represents the kernel size of the depthwise convolution. Specifically, the first term <italic>C<sub>in</sub>
</italic> &#xd7; <italic>K</italic> &#xd7; <italic>K</italic> corresponds to the parameters of the depthwise convolution, which performs spatial filtering independently on each input channel. The second term <italic>C<sub>in</sub>
</italic> &#xd7; <italic>C<sub>out</sub>
</italic> accounts for the pointwise convolution parameters, responsible for inter-channel feature aggregation via 1 &#xd7; 1 convolutions.</p>
<p>In addition to parameter reduction, DWClassify exhibits high computational efficiency. The floating-point operations (FLOPs) required for inference are estimated by (<xref ref-type="disp-formula" rid="eq6">Equation 6</xref>):</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msub>
<mml:mstyle>
<mml:mtext>FLOPs</mml:mtext>
</mml:mstyle>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>H</italic> and <italic>W</italic> denote the spatial dimensions of the input feature map. The multiplication with <italic>H</italic> &#xd7; <italic>W</italic> accounts for the per-pixel computation cost across the entire feature map. Similar to the parameter calculation, the first term reflects the spatial filtering cost of the depthwise convolution, while the second term represents the channel-wise aggregation cost incurred by the pointwise convolution.</p>
<p>The architectural details of DWClassify are illustrated in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. By decoupling spatial and channel-wise operations, DWClassify achieves an optimal balance between model compactness and predictive accuracy. This lightweight design not only enhances inference speed but also ensures seamless deployment in resource-constrained environments such as embedded devices and mobile platforms.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Architecture of the DWClassify module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g008.tif">
<alt-text content-type="machine-generated">Diagram of a neural network architecture with two paths. Both paths start with a &#x201c;DWConv&#x201d; layer, followed by a &#x201c;Conv&#x201d; layer. These paths merge into a &#x201c;Conv2d&#x201d; layer, leading to &#x201c;Cls Loss&#x201d;. Arrows indicate data flow.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and analysis</title>
<sec id="s3_1">
<label>3.1</label>
<title>Experimental setup and evaluation metrics</title>
<sec id="s3_1_1">
<label>3.1.1</label>
<title>Experimental environment and parameters</title>
<p>All experiments were conducted on a workstation equipped with an NVIDIA RTX 3070 Ti GPU. The model was implemented using the PyTorch 1.12 framework under a Windows environment. For optimization, the Adam optimizer was employed with a learning rate of 0.001. The training process utilized a batch size of 32 and was conducted for a total of 300 epochs. The specific configuration details are summarized in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Experimental environment configuration.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Component</th>
<th valign="middle" align="center">Configuration</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">GPU</td>
<td valign="middle" align="center">NVIDIA RTX 3070 Ti</td>
</tr>
<tr>
<td valign="middle" align="center">Framework</td>
<td valign="middle" align="center">PyTorch 1.12</td>
</tr>
<tr>
<td valign="middle" align="center">Optimizer</td>
<td valign="middle" align="center">Adam</td>
</tr>
<tr>
<td valign="middle" align="center">Learning Rate</td>
<td valign="middle" align="center">0.001</td>
</tr>
<tr>
<td valign="middle" align="center">Batch Size</td>
<td valign="middle" align="center">32</td>
</tr>
<tr>
<td valign="middle" align="center">Epochs</td>
<td valign="middle" align="center">300</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_1_2">
<label>3.1.2</label>
<title>Evaluation metrics</title>
<p>To comprehensively evaluate the performance of the proposed CNATNet model, this study adopts four core metrics: floating point operations (FLOPs), number of parameters (Params), accuracy (ACC), and latency. These metrics jointly assess the computational efficiency, model complexity, prediction precision, and real-time inference capability of the network.</p>
<p>FLOPs measure the computational complexity required for a single forward pass, indicating the model&#x2019;s resource consumption. The total FLOPs are computed based on the number of operations performed per spatial location and aggregated over all feature maps, as formulated in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mstyle>
<mml:mtext>FLOPs</mml:mtext>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>L</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>L</italic> denotes the total number of convolutional layers, <italic>H<sub>l</sub>
</italic> and <italic>W<sub>l</sub>
</italic> represent the spatial dimensions of the <italic>l</italic>-th layer&#x2019;s feature map, <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are the input and output channel numbers, and <italic>K</italic> is the kernel size.</p>
<p>Params refer to the total count of learnable parameters within the model, directly reflecting its memory footprint. The calculation is expressed in <xref ref-type="disp-formula" rid="eq8">Equation 8</xref>:</p>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mstyle>
<mml:mtext>Params</mml:mtext>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>L</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where the first term accounts for convolutional weights and the second term represents biases.</p>
<p>Accuracy (ACC) measures the ratio of correctly classified samples to the total number of samples in the test set, providing an intuitive evaluation of the model&#x2019;s classification capability. The formula is shown in <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>:</p>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mstyle>
<mml:mtext>ACC</mml:mtext>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>TP</italic>, <italic>TN</italic>, <italic>FP</italic>, and <italic>FN</italic> denote true positives, true negatives, false positives, and false negatives, respectively.</p>
<p>Latency quantifies the average time taken to process a single input image during inference. This metric reflects the practical deployment capability of the model, particularly in real-time scenarios. The latency is defined in <xref ref-type="disp-formula" rid="eq10">Equation 10</xref>:</p>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:mstyle>
<mml:mtext>Latency</mml:mtext>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>T<sub>total</sub>
</italic> represents the total inference time across all test samples, and <italic>N<sub>samples</sub>
</italic> is the number of test samples.</p>
</sec>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Experimental results and analysis</title>
<p>To comprehensively evaluate the proposed model&#x2019;s effectiveness in real-world safflower sorting scenarios, we divided the classification task into two distinct levels: cluster classification and monomer classification. The cluster-level task simulates coarse-grained recognition of densely packed safflower clusters, where the model must make a global judgment based on group-level visual cues. This setup reflects typical conditions in automated harvesting or packaging lines, where safflower bundles are processed in bulk. In contrast, the monomer-level task focuses on fine-grained classification of individual filaments, emphasizing subtle morphological differences such as color, curvature, and integrity. This setting aligns with higher-precision quality control scenarios, such as pharmaceutical sorting or premium product filtering. By evaluating both levels independently, we aim to demonstrate the robustness and generalizability of CNATNet across varied granularities of visual complexity.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Cluster classification results</title>
<p>To evaluate the effectiveness of different models in identifying the overall quality of densely packed safflower clusters, we formulated a cluster-level classification task. In this setting, the input images consist of multiple overlapping filaments, simulating the typical appearance of harvested safflower in agricultural processing lines. A comprehensive comparison was conducted across a wide set of models, including CNATNet, YOLOv5s, YOLOX (<xref ref-type="bibr" rid="B7">Ge et&#xa0;al., 2021</xref>), DAMO-YOLO (<xref ref-type="bibr" rid="B36">Xu et&#xa0;al., 2022b</xref>), PP-YOLOE (<xref ref-type="bibr" rid="B35">Xu et&#xa0;al., 2022a</xref>), YOLOv6 (<xref ref-type="bibr" rid="B12">Li et&#xa0;al., 2022</xref>), YOLOv7 (<xref ref-type="bibr" rid="B31">Wang et&#xa0;al., 2023b</xref>), YOLOv8n/s/m, YOLOv10n/s/m (<xref ref-type="bibr" rid="B30">Wang et&#xa0;al., 2024</xref>), YOLOv11n/s/m, RT-DETRv2s (<xref ref-type="bibr" rid="B18">Lv et&#xa0;al., 2024</xref>), and RF-DETR-B. The complete quantitative results are summarized in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The visual detection results of CNATNet for cluster classification are shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Cluster-level classification visualization with CNATNet.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g009.tif">
<alt-text content-type="machine-generated">Six piles of safflower strands are displayed on a woven surface. The top row, labeled &#x201c;Premium,&#x201d; shows three piles of vibrant red and yellow strands. The bottom row, labeled &#x201c;Normal,&#x201d; shows three piles that are slightly less vibrant.</alt-text>
</graphic>
</fig>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Quantitative results of the cluster classification experiment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">FLOPs (B)</th>
<th valign="middle" align="center">Params (M)</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">Latency (ms)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">CNATNet</td>
<td valign="middle" align="center">4.6</td>
<td valign="middle" align="center">9.8</td>
<td valign="middle" align="center">98.6</td>
<td valign="middle" align="center">1.9</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5s</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">2.4</td>
<td valign="middle" align="center">85.2</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOXs</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">91.4</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">DAMO-YOLO-1</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">84.4</td>
<td valign="middle" align="center">1.3</td>
</tr>
<tr>
<td valign="middle" align="center">PP-YOLOE+s</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">85.6</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv6-3.0n</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">85.4</td>
<td valign="middle" align="center">1.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv7l</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">93.5</td>
<td valign="middle" align="center">4.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8n</td>
<td valign="middle" align="center">0.5</td>
<td valign="middle" align="center">2.7</td>
<td valign="middle" align="center">85.0</td>
<td valign="middle" align="center">1.1</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8s</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">6.4</td>
<td valign="middle" align="center">88.0</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8m</td>
<td valign="middle" align="center">5.3</td>
<td valign="middle" align="center">17.0</td>
<td valign="middle" align="center">91.3</td>
<td valign="middle" align="center">3.3</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10n</td>
<td valign="middle" align="center">0.5</td>
<td valign="middle" align="center">1.6</td>
<td valign="middle" align="center">89.4</td>
<td valign="middle" align="center">1.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10s</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">5.8</td>
<td valign="middle" align="center">91.6</td>
<td valign="middle" align="center">1.6</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10m</td>
<td valign="middle" align="center">5.1</td>
<td valign="middle" align="center">11.7</td>
<td valign="middle" align="center">93.5</td>
<td valign="middle" align="center">2.9</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11n</td>
<td valign="middle" align="center">0.5</td>
<td valign="middle" align="center">1.6</td>
<td valign="middle" align="center">89.4</td>
<td valign="middle" align="center">1.1</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11s</td>
<td valign="middle" align="center">1.6</td>
<td valign="middle" align="center">5.5</td>
<td valign="middle" align="center">92.2</td>
<td valign="middle" align="center">1.4</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11m</td>
<td valign="middle" align="center">5.1</td>
<td valign="middle" align="center">10.4</td>
<td valign="middle" align="center">93.9</td>
<td valign="middle" align="center">2.1</td>
</tr>
<tr>
<td valign="middle" align="center">RT-DETRv2s</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">20.0</td>
<td valign="middle" align="center">86.4</td>
<td valign="middle" align="center">2.3</td>
</tr>
<tr>
<td valign="middle" align="center">RF-DETR-B</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">19.7</td>
<td valign="middle" align="center">95.8</td>
<td valign="middle" align="center">2.2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in the table, CNATNet achieved the highest classification accuracy (98.6%) while maintaining low latency (1.9 ms) and moderate model size (9.8 M parameters, 4.6 B FLOPs), highlighting its superior balance between performance and computational efficiency. In contrast, lightweight variants such as YOLOv8n and YOLOv10n offered faster inference but at the expense of accuracy, while larger-scale models such as YOLOv7l and YOLOv11m provided competitive accuracy but with considerably higher latency. Transformer-based approaches (RT-DETRv2s and RF-DETR-B) demonstrated stronger representational capacity, with RF-DETR-B achieving the second-highest accuracy (95.8%) but requiring higher model complexity.</p>
<p>It should be noted that for several comparative models, including YOLOX, DAMO-YOLO, PP-YOLOE, YOLOv6, YOLOv7, RT-DETRv2, and RF-DETR, FLOPs and/or parameter counts were not reported in their original publications or repositories, and are thus marked as &#x201c;&#x2013;&#x201d; in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. Since accuracy and latency are available for all methods, the comparative evaluation remains fair and informative.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Monomer classification results</title>
<p>To further evaluate the models on fine-grained morphological recognition, a filament-level classification task was formulated. In this setting, each input image contained a single isolated safflower filament, requiring the models to identify subtle visual cues such as color, curvature, and integrity. A comprehensive comparison was performed across CNATNet, YOLOv5s, YOLOX (<xref ref-type="bibr" rid="B7">Ge et&#xa0;al., 2021</xref>), DAMO-YOLO (<xref ref-type="bibr" rid="B36">Xu et&#xa0;al., 2022b</xref>), PP-YOLOE (<xref ref-type="bibr" rid="B35">Xu et&#xa0;al., 2022a</xref>), YOLOv6 (<xref ref-type="bibr" rid="B12">Li et&#xa0;al., 2022</xref>), YOLOv7 (<xref ref-type="bibr" rid="B31">Wang et&#xa0;al., 2023b</xref>), YOLOv8n/s/m, YOLOv10n/s/m (<xref ref-type="bibr" rid="B30">Wang et&#xa0;al., 2024</xref>), YOLOv11n/s/m, RT-DETRv2s (<xref ref-type="bibr" rid="B18">Lv et&#xa0;al., 2024</xref>), and RF-DETR-B, all trained and tested under identical conditions. The complete quantitative results are reported in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. The visual detection results of CNATNet for monomer classification are shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Filament-level classification visualization with CNATNet.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g010.tif">
<alt-text content-type="machine-generated">Four safflower threads are displayed on a wooden surface. The top two are labeled &#x201c;Premium&#x201d; with more vibrant color and larger size, while the bottom two are labeled &#x201c;Normal&#x201d; and are smaller and less vivid.</alt-text>
</graphic>
</fig>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Quantitative results of the filament classification experiment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">FLOPs (B)</th>
<th valign="middle" align="center">Params (M)</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">Latency (ms)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">CNATNet</td>
<td valign="middle" align="center">4.6</td>
<td valign="middle" align="center">9.8</td>
<td valign="middle" align="center">95.6</td>
<td valign="middle" align="center">1.9</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5s</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">2.4</td>
<td valign="middle" align="center">82.6</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOXs</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">88.7</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">DAMO-YOLO-1</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">81.5</td>
<td valign="middle" align="center">1.3</td>
</tr>
<tr>
<td valign="middle" align="center">PP-YOLOE+s</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">82.9</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv6-3.0n</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">83.0</td>
<td valign="middle" align="center">1.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv7l</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">91.0</td>
<td valign="middle" align="center">4.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8n</td>
<td valign="middle" align="center">0.5</td>
<td valign="middle" align="center">2.7</td>
<td valign="middle" align="center">82.3</td>
<td valign="middle" align="center">1.1</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8s</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">6.4</td>
<td valign="middle" align="center">85.4</td>
<td valign="middle" align="center">1.5</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8m</td>
<td valign="middle" align="center">5.3</td>
<td valign="middle" align="center">17.0</td>
<td valign="middle" align="center">88.6</td>
<td valign="middle" align="center">3.3</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10n</td>
<td valign="middle" align="center">0.5</td>
<td valign="middle" align="center">1.6</td>
<td valign="middle" align="center">86.5</td>
<td valign="middle" align="center">1.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10s</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">5.8</td>
<td valign="middle" align="center">89.2</td>
<td valign="middle" align="center">1.6</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10m</td>
<td valign="middle" align="center">5.1</td>
<td valign="middle" align="center">11.7</td>
<td valign="middle" align="center">91.0</td>
<td valign="middle" align="center">2.9</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11n</td>
<td valign="middle" align="center">0.5</td>
<td valign="middle" align="center">1.6</td>
<td valign="middle" align="center">85.8</td>
<td valign="middle" align="center">1.1</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11s</td>
<td valign="middle" align="center">1.6</td>
<td valign="middle" align="center">5.5</td>
<td valign="middle" align="center">88.3</td>
<td valign="middle" align="center">1.4</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11m</td>
<td valign="middle" align="center">5.1</td>
<td valign="middle" align="center">10.4</td>
<td valign="middle" align="center">86.4</td>
<td valign="middle" align="center">2.1</td>
</tr>
<tr>
<td valign="middle" align="center">RT-DETRv2s</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">20.0</td>
<td valign="middle" align="center">83.8</td>
<td valign="middle" align="center">2.3</td>
</tr>
<tr>
<td valign="middle" align="center">RF-DETR-B</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">19.7</td>
<td valign="middle" align="center">92.1</td>
<td valign="middle" align="center">2.2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in the table, CNATNet achieved the highest accuracy (95.6%) with a low inference latency (1.9 ms) and moderate model complexity (9.8 M parameters, 4.6 B FLOPs), confirming its ability to balance efficiency and precision in fine-grained filament recognition. In contrast, lightweight models such as YOLOv8n and YOLOv10n delivered faster inference but suffered noticeable accuracy drops, whereas larger models like YOLOv7l and YOLOv10m achieved higher accuracy at the cost of increased latency. Transformer-based architectures exhibited strong representational power, with RF-DETR-B reaching the second-highest accuracy (92.1%) but with substantially larger model size and computational requirements.</p>
<p>It should be emphasized that for several comparative models, including YOLOX, DAMO-YOLO, PP-YOLOE, YOLOv6, YOLOv7, RT-DETRv2, and RF-DETR, FLOPs and/or parameter counts were not reported in their original papers or official repositories. These values are therefore omitted (&#x201c;&#x2013;&#x201d;) in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. Nevertheless, since both accuracy and latency are consistently available, the comparative evaluation remains comprehensive and fair.</p>
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Ablation study on structural components</title>
<p>An ablation study was conducted to investigate the individual and joint contributions of the C2S2, AnC2f, and DWClassify modules to the overall performance of CNATNet; the results are shown in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>. The baseline model (M1) was constructed by removing all three proposed modules, employing a conventional convolutional residual block as the backbone, a simplified C2f neck without attention mechanisms, and a dense convolutional layer stack as the classification head. This configuration resulted in 6.4M parameters, 1.7B FLOPs, and 1.4 ms latency, achieving a classification accuracy of 86.2%.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Ablation study on structural components of CNATNet.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">C2S2</th>
<th valign="middle" align="center">AnC2f</th>
<th valign="middle" align="center">DWClassify</th>
<th valign="middle" align="center">FLOPs (B)</th>
<th valign="middle" align="center">Params (M)</th>
<th valign="middle" align="center">Acc (%)</th>
<th valign="middle" align="center">Latency (ms)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">M1 (Baseline)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">1.7</td>
<td valign="middle" align="center">6.4</td>
<td valign="middle" align="center">86.2</td>
<td valign="middle" align="center">1.4</td>
</tr>
<tr>
<td valign="middle" align="center">M2 (+C2S2)</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">1.4</td>
<td valign="middle" align="center">6.1</td>
<td valign="middle" align="center">86.5</td>
<td valign="middle" align="center">1.1</td>
</tr>
<tr>
<td valign="middle" align="center">M3 (+AnC2f)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">5.1</td>
<td valign="middle" align="center">10.5</td>
<td valign="middle" align="center">95.4</td>
<td valign="middle" align="center">2.3</td>
</tr>
<tr>
<td valign="middle" align="center">M4 (+DWClassify)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">1.5</td>
<td valign="middle" align="center">6.0</td>
<td valign="middle" align="center">86.1</td>
<td valign="middle" align="center">1.0</td>
</tr>
<tr>
<td valign="middle" align="center">M6 (+C2S2+AnC2f)</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">4.9</td>
<td valign="middle" align="center">10.1</td>
<td valign="middle" align="center">95.5</td>
<td valign="middle" align="center">2.5</td>
</tr>
<tr>
<td valign="middle" align="center">M7 (+C2S2+DWClassify)</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">1.2</td>
<td valign="middle" align="center">5.8</td>
<td valign="middle" align="center">86.1</td>
<td valign="middle" align="center">0.8</td>
</tr>
<tr>
<td valign="middle" align="center">M8 (+AnC2f+DWClassify)</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">5.0</td>
<td valign="middle" align="center">10.1</td>
<td valign="middle" align="center">95.1</td>
<td valign="middle" align="center">2.4</td>
</tr>
<tr>
<td valign="middle" align="center">M9 (CNATNet)</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">4.6</td>
<td valign="middle" align="center">9.8</td>
<td valign="middle" align="center">95.6</td>
<td valign="middle" align="center">1.9</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Model M2 introduced the lightweight C2S2 backbone, which reduced FLOPs from 1.7B to 1.4B and parameters from 6.4M to 6.1M, with latency reduced to 1.1 ms. Accuracy slightly increased to 86.5%, showing improved efficiency but limited gains in representational power.</p>
<p>Model M3 integrated the AnC2f attention-enhanced fusion module. This markedly improved accuracy to 95.4%, demonstrating its effectiveness in capturing multiscale structural information. However, the added attention increased parameters to 10.5M, FLOPs to 5.1B, and latency to 2.3 ms.</p>
<p>Model M4 replaced the dense classifier with the proposed DWClassify head. This lightweight adjustment reduced parameters to 6.0M and FLOPs to 1.5B, achieving the lowest latency among the single-module variants (1.0 ms). Accuracy was comparable to the baseline (86.1%), highlighting DWClassify&#x2019;s role in efficiency rather than accuracy enhancement.</p>
<p>When combining modules, Model M6 (C2S2+AnC2f) achieved 95.5% accuracy with 10.1M parameters and 4.9B FLOPs, showing that C2S2 complements AnC2f by reducing part of its computational burden. Model M7 (C2S2+DWClassify) provided the most efficient setting, with only 5.8M parameters, 1.2B FLOPs, and 0.8 ms latency, though accuracy remained at 86.1%. Model M8 (AnC2f+DWClassify) yielded 95.1% accuracy with 10.1M parameters, 5.0B FLOPs, and 2.4 ms latency, achieving a balance between accuracy and efficiency compared to using AnC2f alone.</p>
<p>Finally, the complete CNATNet (M9), which integrates all three modules, demonstrated the best trade-off between accuracy and efficiency. It achieved 95.6% accuracy with 9.8M parameters, 4.6B FLOPs, and 1.9 ms latency. Compared with M3 (high accuracy but heavy) and M7 (highest efficiency but low accuracy), CNATNet effectively balances both aspects, confirming the synergistic contribution of C2S2, AnC2f, and DWClassify.</p>
<p>Overall, the ablation results confirm that each proposed module contributes uniquely: C2S2 improves backbone efficiency, AnC2f significantly enhances feature fusion and accuracy, and DWClassify ensures lightweight classification. Their joint integration enables CNATNet to achieve superior performance while maintaining low computational overhead, meeting the requirements of automated safflower filament classification tasks.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Embedded deployment and system implementation on Jetson Orin Nano</title>
<p>With the increasing computational capabilities of modern embedded AI hardware, edge devices have become viable platforms for deploying deep learning models in real-world agricultural applications. To evaluate the practicality of the proposed model in resource-constrained scenarios, we deployed our lightweight classification network, CNATNet, on the Jetson Orin Nano platform. This deployment demonstrates the model&#x2019;s suitability for real-time, on-device safflower filament grading without reliance on high-end GPU servers. The Jetson Orin Nano, based on the ARM architecture and optimized for AI inference tasks, provides a compelling balance between energy efficiency and computational performance. A visual overview of the deployed system is presented in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>, while detailed hardware specifications are summarized in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Jetson Orin Nano physical image.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g011.tif">
<alt-text content-type="machine-generated">NVIDIA Jetson Orin Nano Module diagram featuring labeled components: Ampere GPU, Arm Cortex-A78AE CPU, 8GB RAM, microSD card slot, cooling fan, 40-pin expansion header, power LED, USB-C, Gigabit Ethernet, 4 USB 3.2 Type A ports, DisplayPort, DC barrel jack, dual MIPI CSI camera ports, and IEEE 802.11 WiFi with Bluetooth.</alt-text>
</graphic>
</fig>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Key specifications of Jetson Orin Nano (8GB).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Name</th>
<th valign="middle" align="center">Jetson Orin Nano (8GB)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">CPU</td>
<td valign="middle" align="center">6-core ARM Cortex-A78AE @ 1.7GHz</td>
</tr>
<tr>
<td valign="middle" align="center">GPU</td>
<td valign="middle" align="center">32-core NVIDIA Ampere, 67 TOPS AI compute</td>
</tr>
<tr>
<td valign="middle" align="center">Memory</td>
<td valign="middle" align="center">8GB LPDDR5, 102.4GB/s</td>
</tr>
<tr>
<td valign="middle" align="center">Storage</td>
<td valign="middle" align="center">128GB external NVMe SSD</td>
</tr>
<tr>
<td valign="middle" align="center">Network</td>
<td valign="middle" align="center">1 &#xd7; Gigabit Ethernet</td>
</tr>
<tr>
<td valign="middle" align="center">USB</td>
<td valign="middle" align="center">4 &#xd7; USB 3.2 Gen2</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s3_4_1">
<label>3.4.1</label>
<title>Model deployment</title>
<p>To support intelligent plant recognition in embedded agricultural scenarios, this study adopts a coarse-to-fine classification framework. Specifically, a cluster-level (coarse) classification is first performed to quickly filter and group safflower samples, followed by a fine-grained filament-level recognition to achieve precise grading. The proposed CNATNet model, which integrates lightweight convolutional and attention mechanisms, was initially trained and optimized on a high-performance local workstation. The final optimized version was then deployed to the Jetson Orin Nano platform for real-time on-device inference. This deployment not only verifies the model&#x2019;s efficiency and robustness under low-power constraints, but also highlights the advantage of its lightweight design, which enables stable inference on embedded hardware at 63.29 FPS with only 15 W power consumption. These results demonstrate the potential of CNATNet in enabling intelligent, embedded plant classification for modern agricultural systems.</p>
</sec>
<sec id="s3_4_2">
<label>3.4.2</label>
<title>Test results and analysis</title>
<p>In order to evaluate the real-time performance of the CNATNet-based safflower classification system on the Jetson Orin Nano platform, a live camera-based testing method is employed, enabling real-time recognition of safflower clusters and filaments under practical deployment conditions. The overall process is illustrated in <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref>, where the camera captures input images, which are then processed by the deployed model for on-device inference. The classification results are displayed in real time, demonstrating the effectiveness of the system in practical scenarios. Specifically, subfigure (A) presents the actual on-device deployment setup, including the Jetson Orin Nano, camera, and display screen, while subfigure (B) shows the corresponding real-time prediction results with the classified safflower grade, confirming that the lightweight CNATNet model achieves both accuracy and efficiency in embedded agricultural environments.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Visual demonstration of real-time safflower classification using CNATNet deployed on Jetson Orin Nano. <bold>(A)</bold> On-device inference setup. <bold>(B)</bold> Prediction result: <italic>PREMIUM</italic> grade.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1639269-g012.tif">
<alt-text content-type="machine-generated">A computer monitor displays an image of safflower on a wooden surface. Below, a table holds a small robot with wires attached, positioned near the monitor. A close-up view shows a pile of safflower threads on a textured surface, labeled with the word &#x201c;Premium.&#x201d;</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusion</title>
<p>In this study, a lightweight hybrid network, CNATNet, was proposed for safflower filament classification. The architecture integrates multi-branch convolutional feature extraction, attention-enhanced fusion, and a lightweight classification head, achieving a balance between accuracy and computational efficiency.</p>
<p>Experimental evaluations showed that CNATNet achieved 95.6% classification accuracy, with markedly reduced parameters, floating-point operations, and inference latency. These results confirm that the proposed lightweight design meets the practical requirements of real-time deployment in resource-constrained agricultural environments. Furthermore, deployment on the Jetson Orin Nano platform demonstrated stable real-time performance at low power, validating its suitability for embedded agricultural grading tasks. The lightweight design principles adopted in CNATNet provide a feasible solution for fine-grained quality assessment, with potential applications extending beyond safflower classification to other agricultural and industrial scenarios.</p>
<p>Nevertheless, challenges remain under complex environmental conditions such as variable illumination, occlusion, and background interference, which may affect robustness. Future work will focus on improving generalization through adaptive illumination normalization, domain-specific data augmentation, and lightweight multimodal fusion strategies.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>PM: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Conceptualization, Investigation, Methodology. NL: Data curation, Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. LD: Data curation, Validation, Writing &#x2013; review &amp; editing. YL: Data curation, Visualization, Writing &#x2013; review &amp; editing. ZS: Data curation, Resources, Writing &#x2013; review &amp; editing. YZ: Formal analysis, Visualization, Writing &#x2013; review &amp; editing. ZC: Resources, Writing &#x2013; review &amp; editing. JZ: Funding acquisition, Project administration, Supervision, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. This work was funded by the New Generation Artificial Intelligence Major Project of National Key R&amp;D Program of China (Grant number: 2022ZD0115803).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>The authors thank all those who helped in the course of this research.</p>
</ack>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Qu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Thiele</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Knoll</surname> <given-names>A. C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Ghostvit: Expediting vision transformers via cheap operations</article-title>. <source>IEEE Trans. Artif. Intell.</source> <volume>5</volume>, <fpage>2517</fpage>&#x2013;<lpage>2525</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAI.2023.3326795</pub-id>
</citation></ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ning</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>a). <article-title>A method for real-time recognition of safflower filaments in unstructured environments using the yolo-safi model</article-title>. <source>Sensors (Basel Switzerland)</source> <volume>24</volume>, <elocation-id>4410</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s24134410</pub-id>, PMID: <pub-id pub-id-type="pmid">39001189</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>b). <article-title>Lightweight and fast visual detection method for 3c assembly</article-title>. <source>Displays</source> <volume>82</volume>, <fpage>102631</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.displa.2023.102631</pub-id>
</citation></ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Bruzzone</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Lanet: Local attention embedding to improve the semantic segmentation of remote sensing images</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>59</volume>, <fpage>426</fpage>&#x2013;<lpage>435</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2020.2994150</pub-id>
</citation></ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dwika Hefni Al-Fahsi</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Naghim Fauzaini Prawirosoenoto</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Adi Nugroho</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ardiyanto</surname> <given-names>I.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Givted-net: Ghostnet-mobile involution vit encoder-decoder network for lightweight medical image segmentation</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>81281</fpage>&#x2013;<lpage>81292</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2024.3411870</pub-id>
</citation></ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Multiscale residual network with mixed depthwise convolution for hyperspectral image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>59</volume>, <fpage>3396</fpage>&#x2013;<lpage>3408</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2020.3008286</pub-id>
</citation></ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ge</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Yolox: Exceeding yolo series in 2021</article-title>. <source>arXiv preprint arXiv:2107.08430</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2107.08430</pub-id>
</citation></ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Howard</surname> <given-names>A. G.</given-names>
</name>
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;<article-title>Searching for mobilenetv3</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>, <conf-loc>Seoul, Korea (South)</conf-loc>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>.</citation></ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>D-tldetector: Advancing traffic light detection with a lightweight deep learning model</article-title>. <source>IEEE Trans. Intelligent Transportation Syst.</source> <volume>26</volume>, <fpage>3917</fpage>&#x2013;<lpage>3933</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TITS.2024.3522195</pub-id>
</citation></ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karada&#x11f;</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Computer aided decision making to use optimum water in safflower growing</article-title>. <source>Emirates J. Food Agric.</source> <volume>34</volume>, <fpage>743</fpage>&#x2013;<lpage>749</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.9755/ejfa.2022.v34.i9.2948</pub-id>
</citation></ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lau</surname> <given-names>K. W.</given-names>
</name>
<name>
<surname>Rehman</surname> <given-names>Y. A. U.</given-names>
</name>
<name>
<surname>Po</surname> <given-names>L.-M.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Audiorepinceptionnext: A lightweight single-stream architecture for efficient audio recognition</article-title>. <source>Neurocomputing</source> <volume>578</volume>, <fpage>127432</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2024.127432</pub-id>
</citation></ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Weng</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Geng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Yolov6: A single-stage object detection framework for industrial applications</article-title>. <source>arXiv preprint arXiv:2209.02976</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2209.02976</pub-id>
</citation></ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Duan</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Atkinson</surname> <given-names>P. M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Abcnet: Attentive bilateral contextual network for efficient semantic segmentation of fine-resolution remotely sensed imagery</article-title>. <source>ISPRS J. Photogrammetry Remote Sens.</source> <volume>181</volume>, <fpage>84</fpage>&#x2013;<lpage>98</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isprsjprs.2021.09.005</pub-id>
</citation></ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Machine learning-assisted sers sensor for fast and ultrasensitive analysis of multiplex hazardous dyes in natural products</article-title>. <source>J. Hazardous Materials</source> <volume>482</volume>, <elocation-id>136584</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jhazmat.2024.136584</pub-id>, PMID: <pub-id pub-id-type="pmid">39579701</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Qiao</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>A rapid analysis method of safflower (carthamus tinctorius l.) using combination of computer vision and near-infrared</article-title>. <source>Spectrochimica Acta Part A Mol. Biomolecular Spectrosc.</source> <volume>236</volume>, <fpage>118360</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.saa.2020.118360</pub-id>, PMID: <pub-id pub-id-type="pmid">32330825</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Abeyrathna</surname> <given-names>R. M. R. D.</given-names>
</name>
<name>
<surname>Sampurno</surname> <given-names>R. M.</given-names>
</name>
<name>
<surname>Nakaguchi</surname> <given-names>V. M.</given-names>
</name>
<name>
<surname>Ahamed</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Faster-yolo-ap: A lightweight apple detection algorithm based on improved yolov8 with a new efficient pdwconv in orchard</article-title>. <source>Comput. Electron. Agric.</source> <volume>223</volume>, <fpage>109118</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109118</pub-id>
</citation></ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Scvit: A spatial-channel feature preserving vision transformer for remote sensing image scene classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>PP</volume>, <fpage>1</fpage>&#x2013;<lpage>1</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2022.3157671</pub-id>
</citation></ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Rt-detrv2: Improved baseline with bag-of-freebies for real-time detection transformer</article-title>. <source>arXiv preprint arXiv:2407.17140</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2407.17140</pub-id>
</citation></ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Serpensgate-yolov8: an enhanced yolov8 model for accurate plant disease detection</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1514832</pub-id>, PMID: <pub-id pub-id-type="pmid">39902212</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Special: Zero-shot hyperspectral image classification with clip</article-title>. <source>arXiv preprint arXiv:2501.16222</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2501.16222</pub-id>
</citation></ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yue</surname> <given-names>S.-j.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>G.-S.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>X.-q.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Z.-h.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>The comprehensive evaluation of safflowers in different producing areas by combined analysis of color, chemical compounds, and biological activity</article-title>. <source>Molecules</source> <volume>24</volume>, <elocation-id>3381</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/molecules24183381</pub-id>, PMID: <pub-id pub-id-type="pmid">31533325</pub-id></citation></ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qing</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Improved yolo-fastestv2 wheat spike detection model based on a multi-stage attention mechanism with a lightfpn detection head</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1411510</pub-id>, PMID: <pub-id pub-id-type="pmid">38962247</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ryparova Kvirencova</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Navratilova</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hrbek</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Hajslova</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Detection of botanical adulterants in saffron powder</article-title>. <source>Analytical Bioanalytical Chem.</source> <volume>415</volume>, <fpage>5723</fpage>&#x2013;<lpage>5734</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00216-023-04853-x</pub-id>, PMID: <pub-id pub-id-type="pmid">37587313</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Salek</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Mireei</surname> <given-names>S. A.</given-names>
</name>
<name>
<surname>Hemmat</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Jafari</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sabzalian</surname> <given-names>M. R.</given-names>
</name>
<name>
<surname>Nazeri</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Early monitoring of drought stress in safflower (carthamus tinctorius l.) using hyperspectral imaging: a comparison of machine learning tools and feature selection approaches</article-title>. <source>Plant Stress</source> <volume>14</volume>, <elocation-id>100653</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.stress.2024.100653</pub-id>
</citation></ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname> <given-names>A. G.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Mobilenetv2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Salt Lake City, UT, USA</conf-loc>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>.</citation></ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Comparative analysis of acanthopanacis cortex and periplocae cortex using an electronic nose and gas chromatography&#x2013;mass spectrometry coupled with multivariate statistical analysis</article-title>. <source>Molecules</source> <volume>27</volume>, <elocation-id>8964</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/molecules27248964</pub-id>, PMID: <pub-id pub-id-type="pmid">36558097</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van</surname> <given-names>H. T.</given-names>
</name>
<name>
<surname>Khuat</surname> <given-names>P. T.</given-names>
</name>
<name>
<surname>Van</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Tuan</surname> <given-names>T. T.</given-names>
</name>
<name>
<surname>Chung</surname> <given-names>Y. S.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A deep learning method for differentiating safflower germplasm using optimal leaf structure features</article-title>. <source>Ecol. Inf.</source> <volume>85</volume>, <fpage>102998</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2025.102998</pub-id>
</citation></ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Varghese</surname> <given-names>R.</given-names>
</name>
<name>
<surname>M</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Yolov8: A novel object detection algorithm with enhanced performance and robustness</article-title>,&#x201d; in <conf-name>2024 International Conference on Advances in Data Engineering and Intelligent Computing Systems (ADICS)</conf-name>, <conf-loc>Chennai, India</conf-loc>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation></ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Rep vit: Revisiting mobile cnn from vit perspective</article-title>,&#x201d; in <conf-name>2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Seattle, WA, USA</conf-loc>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> <fpage>15909</fpage>&#x2013;<lpage>15920</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01506</pub-id>
</citation></ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Yolov10: Real-time end-to-end object detection</article-title>. <source>arXiv preprint arXiv:2405.14458</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2405.14458</pub-id>
</citation></ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2023</year>b). &#x201c;<article-title>Yolov7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Vancouver, BC, Canada</conf-loc>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> <fpage>7464</fpage>&#x2013;<lpage>7475</lpage>.</citation></ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I.-H.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y.-H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.-Y.</given-names>
</name>
<name>
<surname>Hsieh</surname> <given-names>J.-W.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Cspnet: A new backbone that can enhance learning capability of cnn</article-title>,&#x201d; in <conf-name>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</conf-name>, <conf-loc>Seattle, WA, USA</conf-loc>. <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> <fpage>1571</fpage>&#x2013;<lpage>1580</lpage>.</citation></ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A lightweight weed detection model for cotton fields based on an improved yolov8n</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>457</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-024-84748-8</pub-id>, PMID: <pub-id pub-id-type="pmid">39747358</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Vegetable disease detection using an improved yolov8 algorithm in the greenhouse plant environment</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>4261</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-024-54540-9</pub-id>, PMID: <pub-id pub-id-type="pmid">38383751</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Cui</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>K.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>a). <article-title>Pp-yoloe: An evolved version of yolo</article-title>. <source>arXiv preprint arXiv:2203.16250</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2203.16250</pub-id>
</citation></ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y.-L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>b). <article-title>Damo-yolo: A report on real-time object detection design</article-title>. <source>arXiv preprint arXiv:2211.15444</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2211.15444</pub-id>
</citation></ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Chanussot</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Spectralmamba: Efficient mamba for hyperspectral image classification</article-title>. <source>arXiv preprint arXiv:2404.08489</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2404.08489</pub-id>
</citation></ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yin</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Convolution-transformer for image feature extraction</article-title>. <source>Comput. Modeling Eng. Sci.</source> <volume>141</volume>, <fpage>87</fpage>&#x2013;<lpage>106</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.32604/cmes.2024.051083</pub-id>
</citation></ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>An improved yolov5 crack detection method combined with a bottleneck transformer</article-title>. <source>Mathematics</source> <volume>11</volume>, <elocation-id>2377</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/math11102377</pub-id>
</citation></ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A quality grade classification method for fresh tea leaves based on an improved YOLOv8x-SPPCSPC-CBAM model</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>4166</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-024-54389-y</pub-id>, PMID: <pub-id pub-id-type="pmid">38378791</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Rao</surname> <given-names>K. G. M.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Indistinct assessment of the quality of traditional Chinese medicine in precision medicine exampling as safflower</article-title>. <source>J. Pharm. Biomed. Anal.</source> <volume>227</volume>, <fpage>115277</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jpba.2023.115277</pub-id>, PMID: <pub-id pub-id-type="pmid">36736110</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A lightweight network for seismic phase picking on embedded systems</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>85103</fpage>&#x2013;<lpage>85114</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2024.3416034</pub-id>
</citation></ref>
</ref-list>
</back>
</article>