<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1515105</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>AMS-MLP: adaptive multi-scale MLP network with multi-scale context relation decoder for pepper leaf segmentation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Fang</surname>
<given-names>Jiangxiong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1802922/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Huaxiang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Shiqing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Hui</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gu</surname>
<given-names>Huaqi</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Fu</surname>
<given-names>Youyao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2113075/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Institute of Intelligent Information Processing, Taizhou University</institution>, <addr-line>Taizhou, Zhejiang</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Information and Remote Sensing, Jiangxi Provincial Natural Resources Development Center</institution>, <addr-line>Nanchang, Jiangxi</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Nathaniel K. Newlands, Agriculture and Agri-Food Canada (AAFC), Canada</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Yalin Wu, Peking University, China</p>
<p>Jianlong Wang, Henan Polytechnic University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Youyao Fu, <email xlink:href="mailto:fuyouyao828@126.com">fuyouyao828@126.com</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>08</day>
<month>04</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1515105</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>17</day>
<month>03</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Fang, Liu, Zhang, Hu, Gu and Fu</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Fang, Liu, Zhang, Hu, Gu and Fu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Pepper leaf segmentation plays a pivotal role in monitoring pepper leaf diseases across diverse backgrounds and ensuring healthy pepper growth. However, existing Transformer-based segmentation methods grapple with computational inefficiency, excessive parameterization, and inadequate utilization of edge information.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address these challenges, this study introduces an Adaptive Multi-Scale MLP (AMS-MLP) framework. This framework integrates the Multi-Path Aggregation Module (MPAM) and the Multi-Scale Context Relation Decoder (MCRD) to refine object boundaries in pepper leaf segmentation. The AMS-MLP includes an encoder, an Adaptive Multi-Scale MLP (AM-MLP) module, and a decoder. The encoder&#x2019;s MPAM fuses five-scale features for accurate boundary extraction. The AM-MLP splits features into global and local branches, with an adaptive attention mechanism balancing them. The decoder enhances boundary feature extraction using MCRD.</p>
</sec>
<sec>
<title>Results</title>
<p>To validate the proposed method, extensive experiments were conducted on three pepper leaf datasets with varying backgrounds. Results demonstrate mean Intersection over Union (mIoU) scores of 97.39%, 96.91%, and 97.91%, and F1 scores of 98.29%, 97.86%, and 98.51% across the datasets, respectively.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Comparative analysis with U-Net and state-of-the-art models reveals that the proposed method significantly improves the accuracy and efficiency of pepper leaf image segmentation.</p>
</sec>
</abstract>
<kwd-group>
<kwd>pepper leaf segmentation</kwd>
<kwd>multi-scale MLP</kwd>
<kwd>multi-path aggregation module</kwd>
<kwd>context relation mask module</kwd>
<kwd>adaptive attention mechanism</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="6"/>
<equation-count count="32"/>
<ref-count count="41"/>
<page-count count="19"/>
<word-count count="9267"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Pepper is a crucial crop in global agriculture, with China being the largest producer and consumer, accounting for 37% of the world&#x2019;s pepper planting area. Essential for daily consumption, pepper plants are highly susceptible to diseases, particularly those affecting the leaves, leading to significant economic losses if not promptly detected and controlled (<xref ref-type="bibr" rid="B3">Bhavini and Sheshang, 2015</xref>; <xref ref-type="bibr" rid="B5">Cruz et&#xa0;al., 2019</xref>). In practice, manual identification of disease spots and severity assessment is commonly used by planters; however, this process is labor-intensive and prone to human error (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2017</xref>).</p>
<p>In recent years, deep learning methods, particularly Convolutional Neural Networks (CNNs), have garnered significant attention in the field of plant disease recognition (<xref ref-type="bibr" rid="B31">Pal and Kumar, 2023</xref>; <xref ref-type="bibr" rid="B2">Beikmohammadi et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B28">Naik et&#xa0;al., 2022</xref>). Current research predominantly relies on single-background images (e.g., desktop, human palm) for recognition (<xref ref-type="bibr" rid="B17">He et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B15">Fatima Naqvi et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B16">Fu et&#xa0;al., 2024</xref>), as their stable backgrounds help highlight disease features, thereby improving recognition accuracy. However, single-background images face challenges in practical field applications, where it is often difficult to obtain backgrounds identical to those in the training images, which can lead to degraded model performance (<xref ref-type="bibr" rid="B16">Fu et&#xa0;al., 2024</xref>). Therefore, precise segmentation of diseased leaves to isolate them from complex and diverse backgrounds is crucial for enhancing the robustness and accuracy of recognition systems.</p>
<p>Image segmentation techniques, especially those leveraging advancements in deep learning, provide effective means to extract pepper leaves from images and are foundational for detecting and diagnosing diseases (<xref ref-type="bibr" rid="B7">Deb et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B14">Fang et&#xa0;al., 2021a</xref>; <xref ref-type="bibr" rid="B29">Ngugi et&#xa0;al., 2021</xref>). Traditional segmentation methods such as threshold-based and region-based techniques (<xref ref-type="bibr" rid="B23">Liu, 2012</xref>; <xref ref-type="bibr" rid="B13">Fang et&#xa0;al., 2021b</xref>) have been widely used but are limited by their reliance on image features and their inability to handle complex backgrounds effectively. Deep learning-based methods, especially CNNs and U-Net architectures, have shown promising results in semantic segmentation tasks (<xref ref-type="bibr" rid="B25">Long et&#xa0;al., 2015</xref>; <xref ref-type="bibr" rid="B32">Ronneberger et&#xa0;al., 2015</xref>; <xref ref-type="bibr" rid="B22">Li et&#xa0;al., 2018</xref>), but they often struggle with capturing detailed boundary information or handling multi-scale features.</p>
<p>Transformer-based networks (<xref ref-type="bibr" rid="B10">Dosovitskiy et&#xa0;al., 2020</xref>) have been proposed to address these issues by leveraging the self-attention mechanism, which allows for the extraction of global context information (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B12">Fang et&#xa0;al., 2023</xref>). However, many of these models focus primarily on global features and overlook detailed boundary information (<xref ref-type="bibr" rid="B40">Zhang et&#xa0;al., 2022</xref>). Multi-layer perceptron (MLP)-based networks, such as the MLP-Mixer (<xref ref-type="bibr" rid="B35">Tolstikhin et&#xa0;al., 2021</xref>), have recently demonstrated the potential to replace attention mechanisms, achieving competitive performance in image segmentation tasks by processing spatial information efficiently (<xref ref-type="bibr" rid="B26">Lv et&#xa0;al., 2022</xref>).</p>
<p>Building on these advancements, we present a novel approach for pepper leaf segmentation, called the adaptive multi-scale MLP (AMS-MLP) network. This network follows an encoder-decoder architecture, integrating the multi-path aggregation mask (MPAM) module with the multi-scale context relation decoder (MCRD) module. To enhance the fusion of global and local information between the encoder and decoder, we introduce the adaptive multi-scale MLP (AM-MLP) module, which replaces traditional skip connection layers. The AM-MLP module overcomes the limitations of convolutional layers&#x2019; inductive biases by effectively handling global information and progressively merging local details. Additionally, the MCRD module strengthens the model&#x2019;s focus on foreground-background boundaries, especially around the segmented edges. Our contributions are as follows:</p>
<list list-type="order">
<list-item>
<p>We propose a novel segmentation framework designed for accurate pepper leaf extraction from complex backgrounds. This framework outperforms previous methods by using a five-layer aggregation feature to generate a single-channel mask, improving segmentation precision along the pepper leaf boundaries.</p>
</list-item>
<list-item>
<p>We introduce the AM-MLP module, based on a self-attention mechanism, to automatically extract multi-scale features. This module consists of two branches: a Global Multi-scale MLP (GMS-MLP) branch and a Local Multi-scale MLP (LMS-MLP) branch, which capture global and local feature maps, respectively. The attention mechanism dynamically adjusts the weight assigned to each, ensuring effective fusion of both.</p>
</list-item>
<list-item>
<p>The MCRD module, leveraging an attention mechanism, combines features across adjacent scales, enhancing boundary delineation and contextual information for the segmented target.</p>
</list-item>
<list-item>
<p>Extensive experiments on the pepper leaf dataset demonstrate that our model outperforms state-of-the-art (SOTA) methods.</p>
</list-item>
</list>
<p>The remainder of the paper is structured as follows: Section 2 reviews related work on semantic segmentation methods. Section 3 details our network architecture. Section 4 describes the experimental setup, and Section 5 presents results and discussion. Finally, Section 6 concludes the paper.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related works</title>
<sec id="s2_1">
<label>2.1</label>
<title>Traditional semantic segmentation methods</title>
<p>Several traditional methods have been proposed for segmenting plant leaf images. Threshold-based techniques, such as fuzzy C-means algorithms (<xref ref-type="bibr" rid="B23">Liu, 2012</xref>), are commonly used to iteratively determine the optimal threshold for leaf image segmentation. Histogram-based thresholding methods, including bimodal histograms and Otsu&#x2019;s Thresholding Method (<xref ref-type="bibr" rid="B20">Kalaivani et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B13">Fang et&#xa0;al., 2021b</xref>), have also been employed for segmenting leaf images. However, these threshold-based methods often struggle with complex images. Region-based approaches, such as the region-based level set method (<xref ref-type="bibr" rid="B14">Fang et&#xa0;al., 2021a</xref>), region growing methods (<xref ref-type="bibr" rid="B19">Jothiaruna et&#xa0;al., 2021</xref>), and wavelet methods (<xref ref-type="bibr" rid="B39">Xiong et&#xa0;al., 2020</xref>), have shown high accuracy and fast processing speeds for plant leaf segmentation. While these methods yield satisfactory results to some extent, their effectiveness is heavily dependent on image features, which limits their broader applicability. Clustering-based methods, such as fuzzy k-means clustering (<xref ref-type="bibr" rid="B34">Tian et&#xa0;al., 2019</xref>), have been used to determine cluster centers for leaf segmentation. However, these methods often struggle with local optima, leading to lower segmentation accuracy.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>CNN-based models for semantic segmentation</title>
<p>Deep learning techniques have revolutionized the field of image segmentation, with convolutional neural networks (CNNs) playing a pivotal role. The introduction of fully convolutional networks (FCN) by <xref ref-type="bibr" rid="B25">Long et&#xa0;al. (2015)</xref> marked a significant milestone, replacing traditional fully connected layers with specialized convolutional layers tailored for segmentation tasks. Building on this, <xref ref-type="bibr" rid="B32">Ronneberger et&#xa0;al. (2015)</xref> proposed the U-Net architecture, which employs an encoder-decoder structure with skip connections to fuse low-level and high-level features. U-Net and its variants, such as R2U-Net (<xref ref-type="bibr" rid="B1">Alom et&#xa0;al., 2018</xref>) and BIONet (<xref ref-type="bibr" rid="B38">Xiang et&#xa0;al., 2020</xref>), have shown strong performance in segmentation, particularly for medical and agricultural applications. However, despite their success, CNN-based methods often face challenges in extracting detailed boundary information, especially in complex and varied environments.</p>
<p>To address these limitations, researchers have incorporated attention mechanisms into CNNs (<xref ref-type="bibr" rid="B30">Oktay et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B41">Zhou et&#xa0;al., 2019</xref>). For example, the squeeze-and-excitation network (SE-Net) (<xref ref-type="bibr" rid="B18">Hu et&#xa0;al., 2018</xref>) uses channel-wise attention to enhance global feature representation, while the attention-guided network (<xref ref-type="bibr" rid="B21">Li et&#xa0;al., 2019a</xref>) focuses on suppressing irrelevant background information. A parallel reverse attention network (PraNet) (<xref ref-type="bibr" rid="B11">Fan et&#xa0;al., 2020</xref>) introduced a reverse attention block to build relationships among object regions and boundaries. Despite their improvements, these models still struggle with precise boundary delineation, especially in complex segmentation tasks such as plant disease recognition.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Transformer-based models for semantic segmentation</title>
<p>Transformer-based models, originally designed for natural language processing (<xref ref-type="bibr" rid="B8">Devlin et&#xa0;al., 2018</xref>), have been adapted for computer vision tasks, including image segmentation. These models use self-attention mechanisms to capture long-range dependencies in images, improving segmentation accuracy for global features. For instance, TransUNet (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2021</xref>) combines the U-Net architecture with transformers to leverage high-level informative features for improved performance. <xref ref-type="bibr" rid="B12">Fang et&#xa0;al. (2023)</xref> proposed BAF-Net, a network combining CNNs and Swin Transformers for plant leaf segmentation. It utilizes MSFF and FSFF branches, enhanced by an adaptive bidirectional attention module, to capture comprehensive features. <xref ref-type="bibr" rid="B6">Dai et&#xa0;al. (2024)</xref> introduced AISOA-SSformer, a Transformer-based segmentation method for rice leaf disease detection. By integrating sparse global updates, feature attention, and optimized algorithms, it achieves high accuracy, aiding modern farming. However, transformer models often focus primarily on global context and struggle with capturing fine-grained details, such as object boundaries.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>MLP-based models for semantic segmentation</title>
<p>Recently, multi-layer perceptron (MLP)-based models have gained attention as a viable alternative to CNNs and transformers for image segmentation. The MLP-Mixer (<xref ref-type="bibr" rid="B35">Tolstikhin et&#xa0;al., 2021</xref>) demonstrated that MLPs could replace self-attention mechanisms in image processing, achieving competitive performance in tasks like image classification. This idea was further explored in the Vision Transformer (ViT) (<xref ref-type="bibr" rid="B27">Melas-Kyriazi, 2021</xref>), where MLPs replaced the attention layers, showing that MLP-based networks could achieve similar results to CNNs and transformers in recognition tasks.</p>
<p>In segmentation tasks, MLP-based models like RepMLPNet (<xref ref-type="bibr" rid="B9">Ding et&#xa0;al., 2022</xref>) and MAXIM (<xref ref-type="bibr" rid="B36">Tu et&#xa0;al., 2022</xref>) have been shown to effectively replace self-attention mechanisms while maintaining high accuracy. These models utilize fully connected layers to capture both local and global context information, making them suitable for complex image segmentation tasks. Additionally, MLPs have been integrated with CNN architectures to form hybrid models that combine the benefits of both approaches. For instance, <xref ref-type="bibr" rid="B37">Valanarasu and Patel (2022)</xref> introduced UNeXt, a convolutional MLP-based network with a U-shaped architecture, comprising three convolution blocks and two tokenized MLP blocks for global information capture and pixel-wise classification. Similarly, the CM-MLP framework (<xref ref-type="bibr" rid="B26">Lv et&#xa0;al., 2022</xref>) integrates multi-scale feature interaction (MSFI) and axial context encoder (ACE) blocks, enhancing local information integration and establishing edge relations between foreground and background regions.</p>
<p>Inspired by the strengths of MLP-based models and transformers, our approach, the Adaptive Multi-Scale MLP (AMS-MLP), combines the benefits of both architectures to address these challenges. The AMS-MLP model integrates multi-path aggregation and multi-scale context relation modules, enabling dynamic fusion of global and local features for accurate segmentation, especially in complex backgrounds. To further highlight the novelty of our work, we provide a comprehensive comparison with existing leaf segmentation methods in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. The table focuses on summarizing the Key Features, Strengths, and Limitations of existing methods, while explicitly outlining How AMS-MLP Differs/Improves over these approaches. Unlike previous approaches, AMS-MLP uniquely leverages adaptive multi-scale feature fusion and context-aware modeling, which significantly improves segmentation accuracy in challenging scenarios. This comparative analysis underscores the advancements of our method and its distinct contributions to the field of leaf segmentation.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comparison of our proposed AMS-MLP with existing leaf segmentation methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Category</th>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">Key Features</th>
<th valign="middle" align="center">Strengths</th>
<th valign="middle" align="center">Limitations</th>
<th valign="middle" align="center">How AMS-MLP Differs/Improves</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="3" align="center">Traditional Methods</td>
<td valign="middle" align="center">Fuzzy C-means (<xref ref-type="bibr" rid="B23">Liu, 2012</xref>)</td>
<td valign="middle" align="center">Iterative thresholding for segmentation</td>
<td valign="middle" align="center">Simple and effective for basic images</td>
<td valign="middle" align="center">Struggles with complex images; sensitive to noise</td>
<td valign="middle" align="center">AMS-MLP uses dynamic feature fusion, handling complex backgrounds and noise robustly.</td>
</tr>
<tr>
<td valign="middle" align="center">Otsu&#x2019;s Thresholding (<xref ref-type="bibr" rid="B20">Kalaivani et&#xa0;al., 2020</xref>)</td>
<td valign="middle" align="center">Histogram-based thresholding</td>
<td valign="middle" align="center">Works well for bimodal intensity distributions</td>
<td valign="middle" align="center">Fails for images with overlapping intensity distributions</td>
<td valign="middle" align="center">AMS-MLP leverages multi-scale context, overcoming intensity distribution challenges.</td>
</tr>
<tr>
<td valign="middle" align="center">Region Growing (<xref ref-type="bibr" rid="B19">Jothiaruna et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="center">Region-based segmentation</td>
<td valign="middle" align="center">High accuracy for simple leaf structures</td>
<td valign="middle" align="center">Limited by seed point selection and image features</td>
<td valign="middle" align="center">AMS-MLP does not rely on seed points; it adapts to varying leaf structures dynamically.</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">CNN-based<break/>Models</td>
<td valign="middle" align="center">U-Net (<xref ref-type="bibr" rid="B32">Ronneberger et&#xa0;al., 2015</xref>)</td>
<td valign="middle" align="center">Encoder-decoder with skip connections</td>
<td valign="middle" align="center">Strong performance for medical and agricultural images</td>
<td valign="middle" align="center">Struggles with detailed boundary information</td>
<td valign="middle" align="center">AMS-MLP integrates multi-path aggregation for precise boundary delineation.</td>
</tr>
<tr>
<td valign="middle" align="center">SE-Net (<xref ref-type="bibr" rid="B18">Hu et&#xa0;al., 2018</xref>)</td>
<td valign="middle" align="center">Channel-wise attention for global feature enhancement</td>
<td valign="middle" align="center">Enhances global feature representation</td>
<td valign="middle" align="center">Limited ability to suppress irrelevant background information</td>
<td valign="middle" align="center">AMS-MLP uses adaptive bidirectional attention to focus on relevant regions and suppress noise.</td>
</tr>
<tr>
<td valign="middle" align="center">PraNet (<xref ref-type="bibr" rid="B11">Fan et&#xa0;al., 2020</xref>)</td>
<td valign="middle" align="center">Reverse attention for object-boundary relationships</td>
<td valign="middle" align="center">Improves object-boundary relationships</td>
<td valign="middle" align="center">Struggles with fine-grained details in complex backgrounds</td>
<td valign="middle" align="center">AMS-MLP combines multi-scale context and local-global feature fusion for fine-grained details.</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Transformer-based Models</td>
<td valign="middle" align="center">TransUNet (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="center">Combines U-Net with transformers for global context</td>
<td valign="middle" align="center">Captures long-range dependencies</td>
<td valign="middle" align="center">Struggles with fine-grained details and boundary delineation</td>
<td valign="middle" align="center">AMS-MLP integrates MLP-based local feature extraction with global context for better boundaries.</td>
</tr>
<tr>
<td valign="middle" align="center">BAF-Net (<xref ref-type="bibr" rid="B12">Fang et&#xa0;al., 2023</xref>)</td>
<td valign="middle" align="center">Combines CNNs and Swin Transformers with adaptive bidirectional attention</td>
<td valign="middle" align="center">Captures comprehensive features</td>
<td valign="middle" align="center">Computationally expensive; struggles with fine details</td>
<td valign="middle" align="center">AMS-MLP is computationally efficient and focuses on fine-grained details through multi-scale MLPs.</td>
</tr>
<tr>
<td valign="middle" align="center">AISOA-SSformer (<xref ref-type="bibr" rid="B6">Dai et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="center">Transformer with sparse global updates and feature attention</td>
<td valign="middle" align="center">High accuracy for rice leaf disease detection</td>
<td valign="middle" align="center">Limited to specific applications; struggles with generalizability</td>
<td valign="middle" align="center">AMS-MLP is generalizable and adaptable to various leaf segmentation tasks.</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">MLP-based<break/>Models</td>
<td valign="middle" align="center">MLP-Mixer (<xref ref-type="bibr" rid="B35">Tolstikhin et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="center">Replaces self-attention with MLPs for image processing</td>
<td valign="middle" align="center">Competitive performance in image classification</td>
<td valign="middle" align="center">Limited exploration in segmentation tasks</td>
<td valign="middle" align="center">AMS-MLP specifically targets segmentation with multi-scale MLPs and dynamic feature fusion.</td>
</tr>
<tr>
<td valign="middle" align="center">UNeXt (<xref ref-type="bibr" rid="B37">Valanarasu and Patel, 2022</xref>)</td>
<td valign="middle" align="center">Hybrid CNN-MLP with U-shaped architecture</td>
<td valign="middle" align="center">Combines CNN and MLP benefits for segmentation</td>
<td valign="middle" align="center">Limited ability to handle complex backgrounds</td>
<td valign="middle" align="center">AMS-MLP enhances local-global feature integration and handles complex backgrounds effectively.</td>
</tr>
<tr>
<td valign="middle" align="center">CM-MLP (<xref ref-type="bibr" rid="B26">Lv et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="center">Multi-scale feature interaction and axial context encoder</td>
<td valign="middle" align="center">Improves local information integration</td>
<td valign="middle" align="center">Struggles with edge relations in noisy images</td>
<td valign="middle" align="center">AMS-MLP uses adaptive multi-scale context relations for robust edge detection.</td>
</tr>
<tr>
<td valign="middle" align="center">Our Proposed Method</td>
<td valign="middle" align="center">AMS-MLP</td>
<td valign="middle" align="center">Combines MLP-based local feature extraction with global context fusion</td>
<td valign="middle" align="center">Dynamic fusion of global and local features; robust to complex backgrounds</td>
<td valign="middle" align="center">Requires careful tuning of multi-scale parameters</td>
<td valign="middle" align="center">Novelty: Combines MLP and transformer strengths; adaptive multi-scale fusion for segmentation.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<sec id="s3_1">
<label>3.1</label>
<title>Dataset</title>
<p>The pepper leaf image datasets utilized in this study were sourced from a private repository maintained by the Nanchang Academy of Agricultural Sciences. These datasets were collected from their farm located in Nanchang city, Jiangxi Province, China, specifically between August 12 and 13, 2022, using multi-view photography techniques. The camera used for image acquisition was equipped with an F5.6 lens and an EF-S 18-135mm f/3.5-5.6 IS USM microlens manufactured by Canon Company (Japan). During the data collection process, the camera was positioned at a height of 10-50 cm above the leaves to ensure high-resolution images. All images were captured after careful focusing, and the camera remained stationary during shooting to eliminate any motion blur or distortion caused by movement.</p>
<p>To ensure the comprehensiveness and practicality of the data, stringent inclusion criteria were established, encompassing various instances of pepper leaf diseases, including healthy leaves and those affected by viral infections. Specifically, the dataset includes images of leaves severely impacted by common diseases such as early blight, brown spot disease, and leaf mold, along with healthy pepper leaves (HPL) and viral diseases (VD) to enrich the diversity of the dataset. This hybrid database serves as a valuable resource for researching and developing methodologies related to pepper leaf segmentation and disease classification in agricultural research.</p>
<p>To further evaluate the effectiveness of the proposed model in segmenting actual pepper leaves, four distinct datasets were meticulously constructed: Early Blight Dataset (EBD), Brown Spot Dataset (BSD), Leaf Mold Dataset (LMD), and Mixed Leaf Dataset (MLD). These datasets were manually annotated using the open-source tool LabelMe, assigning intensity values of 1 to foreground regions and 0 to background regions. During the data processing phase, we conducted a meticulous statistical analysis to ensure the representativeness and balance of the data across various disease categories and leaf conditions. This analysis involved calculating the distribution of images among different disease categories, conducting rigorous checks on the completeness and quality of annotations, and verifying the accuracy and reliability of the data. As shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, the datasets for EBD, BSD, LMD, and MLD comprised 1190, 1385, 1384, and 6613 images, respectively. Notably, the MLD dataset integrates image data from EBD, BSD, and LMD, as well as healthy pepper leaves and viral diseases, with 1353 images of healthy leaves and 1301 images of leaves with viral diseases. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> shows several representative examples from the EBD, BSD, MLD and HPL, respectively. The statistical analysis confirmed the balanced nature of the MLD dataset, and all data were comprehensively annotated. Furthermore, to facilitate a comprehensive evaluation, each dataset was divided into training (70%), validation (10%), and testing (20%) subsets. In the experiment, we standardized the image size for each dataset to 512&#xd7;512 pixels, facilitating consistent processing and analysis across all datasets.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The distribution of the four image datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Dataset</th>
<th valign="middle" align="center">Test</th>
<th valign="middle" align="center">Training</th>
<th valign="middle" align="center">Validation</th>
<th valign="middle" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">
<bold>Early Blight Dataset (EBD)</bold>
</td>
<td valign="middle" align="center">238</td>
<td valign="middle" align="center">833</td>
<td valign="middle" align="center">119</td>
<td valign="middle" align="center">1190</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Brown Spot Dataset (BSD)</bold>
</td>
<td valign="middle" align="center">277</td>
<td valign="middle" align="center">970</td>
<td valign="middle" align="center">138</td>
<td valign="middle" align="center">1385</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Leaf Mold Dataset (LMD)</bold>
</td>
<td valign="middle" align="center">277</td>
<td valign="middle" align="center">969</td>
<td valign="middle" align="center">138</td>
<td valign="middle" align="center">1384</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Mixed Leaf Dataset (MLD)</bold>
</td>
<td valign="middle" align="center">1323</td>
<td valign="middle" align="center">4629</td>
<td valign="middle" align="center">661</td>
<td valign="middle" align="center">6613</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold text indicates the dataset names.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Sample images against different plain backgrounds.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g001.tif"/>
</fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Method</title>
<p>In this section, we will provide an overview of the AMS-MLP model and discuss the incorporation of three key modules within the encoder-decoder architecture. These modules consist of the adaptive multi-scale MLP module, the multi-scale context relation decoder module, and the multi-path aggregation mask module. Additionally, we will present the loss function utilized in the model. By integrating these modules and utilizing an appropriate loss function, the AMS-MLP model demonstrates improved performance in image segmentation tasks.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Overall architecture</title>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> illustrates the network architecture of the proposed AMS-MLP network, based on a U-shape design. The AMS-MLP model consists of three core components: the encoder network, the AM-MLP module, and the decoder network. The encoder network includes five convolutional layers with four downsampling operations and integrates an MPAM module. Each convolutional block within the encoder comprises a 3&#xd7;3 convolutional layer, batch normalization, ReLU activation, and max-pooling with a stride of 2. Multi-scale features from these layers are combined in the MPAM module to generate a preliminary mask, further refined by the MSRD module for edge information capture. The AM-MLP module, a critical component of the AMS-MLP network, employs self-attention to extract multi-scale features and local information automatically. The decoder network in the AMS-MLP model consists of five convolutional blocks with four upsampling layers and three MSRD modules. Each decoder block includes a 3&#xd7;3 convolutional layer, batch normalization, and ReLU activation. The first MSRD module utilizes the mask from the MPAM module and features from the fifth layer, while subsequent MSRD modules further refine segmentation within the decoder. Deconvolution operations increase image resolution by a factor of 2 per block, restoring finer details lost during downsampling.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Overview of the AMS-MLP framework including the encoder network, the adaptive multi-scale MLP (AM-MLP) module, and the decoder network. The encoder network comprises five convolutional layers incorporating four downsampling operations and a multi-path aggregation mask (MPAM) module. The decoder network comprises five convolutional layers, incorporating four upsampling layers and three MSRD modules. The AM-MLP module is used for the skip connection layer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g002.tif"/>
</fig>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Adaptive multi-scale MLP module</title>
<p>The MLP module has demonstrated promising performance in computer vision tasks, but it struggles with capturing spatial information and extracting global context due to its fully connected nature. To overcome these limitations, MAXIM (<xref ref-type="bibr" rid="B36">Tu et&#xa0;al., 2022</xref>) employs multi-scale MLP modules to extract global and local information. Inspired by MAXIM, we introduce an adaptive multi-scale MLP module that utilizes the self-attention mechanism to automatically extract multi-scale features and local information. As illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>, the network initially splits the feature maps into two branches: the global multi-scale MLP (GMS-MLP) branch and the local multi-scale MLP (LMS-MLP) branch. The GMS-MLP branch focuses on extracting global features, while the LMS-MLP branch is dedicated to capturing local feature maps. <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref> illustrates the GMS-MLP and LMS-MLP modules. To effectively combine these features, we introduce an adaptive attention module that dynamically adjusts the weights of the global and local features based on their importance and relevance to the task. By incorporating this adaptive attention mechanism, the AM-MLP module enables the extraction of both global and local information in an adaptive manner while preserving spatial information and capturing contextual cues from different scales.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The network architecture of the AM-MLP module. The input feature map F is split into the global multi-scale MLP (GMS-MLP) branch <bold>F<sup>G</sup>
</bold> and the local multi-scale MLP (LMS-MLP) branch <bold>F<sup>L</sup>
</bold>. After each branch passes through multiple cascaded MLP blocks, the resulting features are alternately multiplied to enhance information interaction and then added together. Then, multi-scale features and local information are automatically extracted using an adaptive attention mechanism.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g003.tif"/>
</fig>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Illustration of the GMS-MLP and LMS-MLP modules. As an example, we used <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;&#x200a;&#x200a;&#x200a;&#x200a;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;&#x200a;&#x200a;&#x200a;&#x200a;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (W = 16; H = 16) as input, where B is the batch size, and C is the channel number. Input feature <inline-formula>
<mml:math display="inline" id="im2">
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
</mml:math>
</inline-formula> will be processed by GMS-MLP and LMS-MLP branches. In the GMS-MLP branch, the feature map <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is initially divided into non-overlapping patches of size 2 &#xd7; 2, resulting in a grid of size 8 &#xd7; 8. These patches are then flattened and fed into a fully connected (FC) layer along the first axis. Finally, the output is reshaped back and ungridded to restore the original size. In the LMS-MLP branch, the feature map <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is divided into non-overlapping patches of size 8 &#xd7; 8, resulting in a blocking of size 2 &#xd7; 2. These patches are flattened and processed through an FC layer along the second axis. Following that, the output is reshaped back and unblocked to regain the original size, resulting in the feature map <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g004.tif"/>
</fig>
<p>Specifically, the input features <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> undergo an initial split into two branches based on the channel dimension, namely the GMS-MLP branch <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;C</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and the LMS-MLP branch <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;C</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where B represents the batch size, C represents the channel number, and H and W represent the height and width of the image, respectively. In the GMS-MLP branch, the input features are first passed through a fully connected (FC) layer, followed by a layer normalization (LN) layer. The next step involves applying an additional fully connected (FC) layer and a GELU activation layer to generate the feature map <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;C</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The generated feature map is then transformed into non-overlapping image patches, where each patch consists of a certain number of <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:mtext>g</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>g</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> grids. These patched features <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;g</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>g</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;</mml:mtext>
<mml:msub>
<mml:mtext>H</mml:mtext>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>W</mml:mtext>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are further processed through three consecutive multi-scale MLP modules, where <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mtext>H</mml:mtext>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>H</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mtext>g</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;</mml:mtext>
<mml:msub>
<mml:mtext>W</mml:mtext>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>W</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mtext>g</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>, and g is the kernel size. This process leads to the generation of novel feature maps <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, which can be denoted as <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq4">4</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;W</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;W</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#x2009;&#x2009;&#x2009;</mml:mtext>
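<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>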
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;g</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>g</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mtext>H</mml:mtext>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>W</mml:mtext>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;g</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;</mml:mtext>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;</mml:mtext>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes dividing a multidimensional matrix or tensor into multiple sub-tensors along the channel dimension, and <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the fully connected layer. <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the layer normalization layer, <inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the GELU activation function, <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the operation of changing the shape or dimensions of a feature matrix. The GMS-MLP branch <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> consists of three consecutive MLP modules with grid sizes of <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>g</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively.</p>
<p>Similarly, in the LMS-MLP branch, the LMS-MLP feature <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> passes through an FC layer and a layer normalization (LN) layer. Subsequently, it passes through an FC layer and a GELU activation layer. The novel feature maps <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are projected into non-overlapping image patches to generate new feature maps <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>b</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;b</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mtext>H</mml:mtext>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>W</mml:mtext>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mtext>H</mml:mtext>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>H</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;</mml:mtext>
<mml:msub>
<mml:mtext>W</mml:mtext>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>W</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mtext>b</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>, b is the kernel size, and the size of each image patch is <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:mtext>b</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>b</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> grids. Then, the feature maps <inline-formula>
<mml:math display="inline" id="im28">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> pass through three consecutive multi-scale MLP modules to obtain the spatial information, which is written as <xref ref-type="disp-formula" rid="eq5">Equations 5</xref>&#x2013;<xref ref-type="disp-formula" rid="eq7">7</xref>:</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>b</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;</mml:mtext>
<mml:msubsup>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mtext>block</mml:mtext>
</mml:mrow>
<mml:mtext>L</mml:mtext>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;b</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mtext>H</mml:mtext>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>W</mml:mtext>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mtext>b</mml:mtext>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>b</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;b</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im29">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes three consecutive MLP modules with grid sizes of <inline-formula>
<mml:math display="inline" id="im30">
<mml:mrow>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im31">
<mml:mrow>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math display="inline" id="im32">
<mml:mrow>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>b</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively.</p>
<p>A self-attention module is employed to effectively fuse two features <inline-formula>
<mml:math display="inline" id="im33">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im34">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> obtained from the GMS-MLP and LMS-MLP branches, and it guides the segmentation network to select more representative features along the channel dimension. Specifically, the two features <inline-formula>
<mml:math display="inline" id="im35">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mtext>G</mml:mtext>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im36">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mtext>L</mml:mtext>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are fused and then passed through the global average pooling (GAP) operation to compress the channel dimension, which can be represented as <xref ref-type="disp-formula" rid="eq8">Equation 8</xref>:</p>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>H</mml:mtext>
<mml:mi>G</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mtext>G</mml:mtext>
</mml:msubsup>
<mml:mo>&#x2295;</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mtext>L</mml:mtext>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im37">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>H</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the output features of the GAP layer. Then, the features <inline-formula>
<mml:math display="inline" id="im38">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>H</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are input into an FC layer, followed by a batch normalization layer and a sigmoid function. The probability feature maps <inline-formula>
<mml:math display="inline" id="im39">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>H</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> can be expressed as <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>:</p>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>H</mml:mtext>
<mml:mrow>
<mml:mtext>FC</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>H</mml:mtext>
<mml:mi>G</mml:mi>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im40">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the sigmoid activation function, and <inline-formula>
<mml:math display="inline" id="im41">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a batch normalization layer. Then, we apply another FC layer to the features <inline-formula>
<mml:math display="inline" id="im42">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>H</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> followed by the softmax activation function, and the channel attention map <inline-formula>
<mml:math display="inline" id="im43">
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
</mml:math>
</inline-formula> is written as <xref ref-type="disp-formula" rid="eq10">Equation 10</xref>:</p>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>H</mml:mtext>
<mml:mrow>
<mml:mtext>FC</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im44">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the softmax activation layer. We regard the channel attention map <inline-formula>
<mml:math display="inline" id="im45">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> as the weight of the features, where <inline-formula>
<mml:math display="inline" id="im46">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The channel attention map <inline-formula>
<mml:math display="inline" id="im47">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
<mml:mo>'</mml:mo>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is from the value <inline-formula>
<mml:math display="inline" id="im48">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula>, and it satisfies <inline-formula>
<mml:math display="inline" id="im49">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
<mml:mo>'</mml:mo>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula>. An important observation is that the channel attention maps <inline-formula>
<mml:math display="inline" id="im50">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im51">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>'</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> enable the adaptive adjustment of weights for the two channel attention feature maps. It also demonstrates that the two feature maps are capable of extracting feature representations from different receptive fields. By flexibly adjusting the adaptive weights of two features <inline-formula>
<mml:math display="inline" id="im52">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im53">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, the feature maps can be expressed as <xref ref-type="disp-formula" rid="eq11">Equations 11</xref>&#x2013;<xref ref-type="disp-formula" rid="eq13">13</xref>:</p>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>'</mml:mo>
</mml:mstyle>
</mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#xd7;</mml:mo>
</mml:mstyle>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>'</mml:mo>
</mml:mstyle>
</mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3b1;</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>'</mml:mo>
</mml:mstyle>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>&#xd7;</mml:mo>
</mml:mstyle>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>G</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>'</mml:mo>
</mml:mstyle>
</mml:msubsup>
<mml:mo>&#x2299;</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>L</mml:mi>
</mml:mstyle>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>'</mml:mo>
</mml:mstyle>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im54">
<mml:mo>&#x2299;</mml:mo>
</mml:math>
</inline-formula> denotes the concatenation operator, <inline-formula>
<mml:math display="inline" id="im55">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>G</mml:mtext>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x200a;&#x200a;&#x200a;&#x200a;&#x200a;</mml:mtext>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>L</mml:mtext>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211c;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mtext>B</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;C</mml:mtext>
<mml:mo stretchy="false">/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;H</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2004;W</mml:mtext>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the two adaptively weighted output features, respectively.</p>
<p>Notably, the grid size <bold>g</bold> and the block size <bold>b</bold> satisfy a specific relationship. As exemplified in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>, the network structure of the GMS-MLP and LMS-MLP branches is depicted. When reducing the patch size in the GMS-MLP block, the block size in the LMS-MLP branch increases accordingly. For instance, when considering an image size of 32, the grid sizes in the GMS-MLP branch are set to 8, 4, and 2, while the corresponding block sizes in the LMS-MLP branch are 4, 8, and 16, respectively. This arrangement results in a larger number of patches within the global MLP, enabling the capture of spatial information among the patches. Conversely, in the LMS-MLP branch, a larger number of pixels in each block allows for the retention of local spatial information between pixels. Consequently, by fusing the GMS-MLP and LMS-MLP blocks, a comprehensive feature map can be generated, encompassing both global and local information in a progressively richer manner.</p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Multi-scale context relation decoder module</title>
<p>The accurate extraction of boundaries between foreground and background regions relies on the presence of both local and contextual information. To address this, the Mask refinement network (<xref ref-type="bibr" rid="B33">Tang et&#xa0;al., 2021</xref>) leverages contextual relationships to improve the pixel boundaries in these regions. In line with this, we propose an MCRD module to enhance the target boundary features and contextual information. As shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, our approach first upsamples the high-level feature maps <inline-formula>
<mml:math display="inline" id="im56">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> through non-linear interpolation with a rate of 2, followed by a sigmoid activation function. The resulting feature maps are then fed into a <inline-formula>
<mml:math display="inline" id="im57">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional block, which generates an output with a single channel. The process is formulated as <xref ref-type="disp-formula" rid="eq14">Equations 14</xref>, <xref ref-type="disp-formula" rid="eq15">15</xref>:</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Illustration of the multi-scale context relation decoder (MCRD) module. Two feature maps <inline-formula>
<mml:math display="inline" id="im58">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im59">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are input into the MCRD module, and the high-level features are first upsampled. The generated feature maps <inline-formula>
<mml:math display="inline" id="im60">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> pass through the sigmoid activation function and a <inline-formula>
<mml:math display="inline" id="im61">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional operation, which generates the mask maps <inline-formula>
<mml:math display="inline" id="im62">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>mask</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> representing the foreground and background regions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g005.tif"/>
</fig>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>mask</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im63">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the upsampling operator, <inline-formula>
<mml:math display="inline" id="im64">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a <inline-formula>
<mml:math display="inline" id="im65">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional operation.</p>
<p>Then, the mask maps <inline-formula>
<mml:math display="inline" id="im66">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>mask</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are used to assign different weights to the foreground and background feature maps, which are written as <xref ref-type="disp-formula" rid="eq16">Equations 16</xref>, <xref ref-type="disp-formula" rid="eq17">17</xref>:</p>
<disp-formula id="eq16">
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>fg</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msup>
<mml:mo>&#x2297;</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>mask</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq17">
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>bg</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msup>
<mml:mo>&#x2297;</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>mask</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im67">
<mml:mo>&#x2297;</mml:mo>
</mml:math>
</inline-formula> denotes the dot product, <inline-formula>
<mml:math display="inline" id="im68">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a <inline-formula>
<mml:math display="inline" id="im69">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional block.</p>
<p>Finally, we concatenate two feature maps <inline-formula>
<mml:math display="inline" id="im70">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>fg</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im71">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>bg</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension and then apply a <inline-formula>
<mml:math display="inline" id="im72">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional layer, which is written as <xref ref-type="disp-formula" rid="eq18">Equation 18</xref>:
</p>
<disp-formula id="eq18">
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>bg</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>fg</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2299;</mml:mo>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>bg</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s3_2_4">
<label>3.2.4</label>
<title>Multi-path aggregation mask module</title>
<p>The multi-scale nature of features in deep neural networks offers different levels of information, with deeper layers capturing coarser details and shallower layers preserving finer details. To leverage the benefits of each layer, we introduce an MPAM module to enhance the extraction of accurate boundary information and facilitate the generation of masks. As shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, for the feature map <inline-formula>
<mml:math display="inline" id="im73">
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the fifth layer to the second layer in the encoder, each feature map is subjected to a <inline-formula>
<mml:math display="inline" id="im74">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional operation to decrease the channel dimensions. The resulting feature maps have the same channel number as the first layer in the encoder. Additionally, we employ an upsampling operation with a rate of 2 on these feature maps. This procedure can be expressed as <xref ref-type="disp-formula" rid="eq19">Equation 19</xref>:</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Illustration of the multi-path aggregation mask (MPAM) module. From the fifth to the second layer, the feature maps <inline-formula>
<mml:math display="inline" id="im75">
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the encoder are first passed through a <inline-formula>
<mml:math display="inline" id="im76">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional operation to reduce the number of channels, so that the output features have the same number of channels as the first layer in the encoder.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g006.tif"/>
</fig>
<disp-formula id="eq19">
<label>(19)</label>
<mml:math display="block" id="M19">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;i</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Subsequently, the generated feature maps are further processed by the Sigmoid activation function. We then concatenate the generated feature <inline-formula>
<mml:math display="inline" id="im77">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and the previous feature maps <inline-formula>
<mml:math display="inline" id="im78">
<mml:mrow>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the final feature maps are written as <xref ref-type="disp-formula" rid="eq20">Equation 20</xref>:</p>
<disp-formula id="eq20">
<label>(20)</label>
<mml:math display="block" id="M20">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x2299;</mml:mo>
<mml:mtext>&#x200a;&#x200a;&#x200a;</mml:mtext>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;i</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<p>To incorporate information from various scales, we utilize an element-wise addition operation between four upsampling feature maps and the feature maps <inline-formula>
<mml:math display="inline" id="im79">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> obtained from the first layer in the encoder. This operation produces multi-scale fusion feature maps (MSFF), which can be denoted as <xref ref-type="disp-formula" rid="eq21">Equation 21</xref>:</p>
<disp-formula id="eq21">
<label>(21)</label>
<mml:math display="block" id="M21">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>cat</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:msubsup>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2299;</mml:mo>
<mml:mo>}</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:msubsup>
<mml:mtext>&#x2004;</mml:mtext>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In the final step, we concatenate the MSFF maps <inline-formula>
<mml:math display="inline" id="im80">
<mml:mrow>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>cat</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mtext>&#x200a;</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> with the feature maps <inline-formula>
<mml:math display="inline" id="im81">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> obtained from the first layer. The concatenated feature maps <inline-formula>
<mml:math display="inline" id="im82">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mtext>en</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are then fed into a convolutional block with <inline-formula>
<mml:math display="inline" id="im83">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> filters. To generate the mask for the foreground and background regions, we apply a <inline-formula>
<mml:math display="inline" id="im84">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional operation with a single output channel, followed by a downsampling operation. Mathematically, this can be represented as <xref ref-type="disp-formula" rid="eq22">Equations 22</xref>, <xref ref-type="disp-formula" rid="eq23">23</xref>:</p>
<disp-formula id="eq22">
<label>(22)</label>
<mml:math display="block" id="M22">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mtext>en</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:msup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>cat</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2299;</mml:mo>
<mml:mtext>&#x200a;&#x200a;&#x200a;</mml:mtext>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq23">
<label>(23)</label>
<mml:math display="block" id="M23">
<mml:mrow>
<mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>up</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mo>=</mml:mo>
</mml:mstyle>
<mml:mi>d</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>F</mml:mi>
</mml:mstyle>
<mml:mtext>i</mml:mtext>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>)</mml:mo>
<mml:mtext>&#x2004;&#x2004;&#x2004;&#x2004;&#x2004;i</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im85">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the downsampling operator.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental setup</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental environment configuration</title>
<p>The proposed AMS-MLP network was implemented on a specific hardware configuration consisting of a 12th Gen Intel(R) Core(TM) i7-12700K 3.60 GHz processor and an NVIDIA GeForce RTX 3090 40 GB GPU with 32 GB of RAM. The operating system employed was Windows 11, and the Conda environment was utilized to ensure a consistent software environment for the execution.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental scheme</title>
<p>For the parameter settings of the AMS-MLP network, we employed a batch size of 2 and trained the model for a total of 60 epochs. The optimization process was performed using stochastic gradient descent (SGD) with an initial learning rate of 0.001, and the learning rate was adjusted according to the learning rate schedule. During training, these parameter choices facilitated effective convergence of the AMS-MLP network. To assess the model&#x2019;s performance, we compared it against several state-of-the-art (SOTA) models using various performance metrics. Additionally, ablation studies were conducted to evaluate the impact of different network components, such as the choice of activation functions and the depth of layers, on the overall performance. In our experiments, we also tested other values for batch size, learning rate, and epoch count, but found that the chosen configuration yielded the best performance in terms of both training stability and testing accuracy.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Training loss</title>
<p>The proposed AMS-MLP network is trained with two loss functions that measure the discrepancy between the predicted result and the ground truth (GT): the binary cross entropy (BCE) loss <inline-formula>
<mml:math display="inline" id="im86">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the Dice <inline-formula>
<mml:math display="inline" id="im87">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The two loss functions are defined as <xref ref-type="disp-formula" rid="eq24">Equations 24</xref>, <xref ref-type="disp-formula" rid="eq25">25</xref>:</p>
<disp-formula id="eq24">
<label>(24)</label>
<mml:math display="block" id="M24">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq25">
<label>(25)</label>
<mml:math display="block" id="M25">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>f</italic> denotes the input predicted result, and <italic>g</italic> denotes the corresponding ground truth label.</p>
<p>Therefore, our final loss <inline-formula>
<mml:math display="inline" id="im88">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be expressed as <xref ref-type="disp-formula" rid="eq26">Equation 26</xref>:</p>
<disp-formula id="eq26">
<label>(26)</label>
<mml:math display="block" id="M26">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Performance evaluation</title>
<p>To rigorously evaluate the performance of the proposed method and the compared methods, six metrics are employed as evaluation criteria: accuracy, recall, precision, specificity, F1-score, and intersection over union (IoU). These metrics are defined as follows:</p>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Accuracy</title>
<p>Accuracy measures the overall correctness of the predictions, calculated as the ratio of correctly predicted pixels (both foreground and background) to the total number of pixels.</p>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Recall</title>
<p>Recall (also known as sensitivity) measures the ability of the model to identify all relevant instances (foreground pixels), calculated as the ratio of true positives (TP) to the sum of true positives and false negatives (FN).</p>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>Precision</title>
<p>Precision measures the accuracy of the positive predictions, calculated as the ratio of true positives to the sum of true positives and false positives (FP).</p>
</sec>
<sec id="s4_4_4">
<label>4.4.4</label>
<title>Specificity</title>
<p>Specificity measures the ability of the model to identify all irrelevant instances (background pixels), calculated as the ratio of true negatives (TN) to the sum of true negatives and false positives.</p>
</sec>
<sec id="s4_4_5">
<label>4.4.5</label>
<title>F1-score</title>
<p>F1-score is a harmonic mean of precision and recall, providing a single metric that balances both the precision and the recall of the model. It is particularly useful when the classes are of unequal size or when there is a trade-off between precision and recall.</p>
</sec>
<sec id="s4_4_6">
<label>4.4.6</label>
<title>Intersection over Union</title>
<p>Intersection over Union (IoU) is a metric commonly used to evaluate the accuracy of segmentation predictions. It measures the overlap between the predicted region and the ground-truth region by calculating the ratio of their intersection to their union.</p>
<p>These metrics are standard measures widely used for performance evaluation, and they are defined as <xref ref-type="disp-formula" rid="eq27">Equations 27</xref>&#x2013;<xref ref-type="disp-formula" rid="eq31">31</xref>:</p>
<disp-formula id="eq27">
<label>(27)</label>
<mml:math display="block" id="M27">
<mml:mrow>
<mml:mtext>accuracy</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq28">
<label>(28)</label>
<mml:math display="block" id="M28">
<mml:mrow>
<mml:mtext>recall</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq29">
<label>(29)</label>
<mml:math display="block" id="M29">
<mml:mrow>
<mml:mtext>precision</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq30">
<label>(30)</label>
<mml:math display="block" id="M30">
<mml:mrow>
<mml:mtext>specificity</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq31">
<label>(31)</label>
<mml:math display="block" id="M31">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>precision</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>precision</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq32">
<label>(32)</label>
<mml:math display="block" id="M32">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where TP (true positive) represents the number of pixels that are correctly predicted as foreground, TN (true negative) indicates the number of pixels that are correctly predicted as background, FP (false positive) refers to the number of pixels that are predicted as foreground but actually belong to the background according to the ground truth. On the other hand, FN (false negative) represents the number of pixels that are predicted as background but actually belong to the foreground according to the ground truth.</p>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="results">
<label>5</label>
<title>Results and discussion</title>
<sec id="s5_1">
<label>5.1</label>
<title>Comparison with the SOTA models</title>
<p>To evaluate the segmentation performance of the AMS-MLP model, we conducted a comparative study against state-of-the-art (SOTA) models using three distinct leaf datasets: EBD, BSD, and MLD. The models included FCN-VGG16, U-Net (<xref ref-type="bibr" rid="B32">Ronneberger et&#xa0;al., 2015</xref>), attention U-Net (AttU-Net) (<xref ref-type="bibr" rid="B30">Oktay et&#xa0;al., 2018</xref>), UNet++ (<xref ref-type="bibr" rid="B41">Zhou et&#xa0;al., 2019</xref>), UNeXt (<xref ref-type="bibr" rid="B37">Valanarasu and Patel, 2022</xref>), and CM-MLP model (<xref ref-type="bibr" rid="B26">Lv et&#xa0;al., 2022</xref>). To ensure a fair and comprehensive comparison, all models were trained, validated, and tested on the same three datasets. By maintaining consistency across the training, validation, and test datasets, we aim to eliminate any potential bias or variation that may affect the results and evaluate the segmentation performance exclusively on the test dataset.</p>
<p>
<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref> and <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref> present the segmentation results of the AMS-MLP model compared to six other models on the EBD dataset, evaluated across six metrics: accuracy, recall, specificity, precision, mean Intersection over Union (mIoU), and F1-score. Our model achieves the highest scores across five of these metrics, notably achieving 97.39% in mIoU and 98.29% in F1-score, surpassing FCN by 0.35% and 0.29%, respectively. These superior metrics reflect the AMS-MLP model&#x2019;s ability to leverage the GMS-MLP and LMS-MLP modules effectively. The GMS-MLP module captures global context, enabling robust feature extraction across the entire image, while the LMS-MLP module enhances local detail recognition within specific leaf structures. This dual-stream approach ensures comprehensive information integration, leading to enhanced segmentation accuracy. Compared to U-Net, our model demonstrates significant improvements with increases of 9.92% in mIoU, 0.18% in F1-score, and 0.30% in recall. This enhancement can be attributed to the AMS-MLP&#x2019;s capacity to combine both global and local features effectively, thereby improving boundary delineation and reducing segmentation errors. Furthermore, compared to other semantic segmentation models, our approach achieves the highest scores in accuracy, recall, precision, mIoU, and F1-score, underscoring its robust performance across multiple evaluation criteria. Qualitative examples in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref> highlight the models&#x2019; ability to locate object regions accurately, with our proposed model notably delineating pepper leaf boundaries with precision. This visual evidence further substantiates the effectiveness and superiority of the AMS-MLP model in leaf segmentation tasks.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>The results of segmenting the EBD dataset using seven different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" colspan="2" align="center">Model</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">Recall (%)</th>
<th valign="middle" align="center">Specificity (%)</th>
<th valign="middle" align="center">Precision (%)</th>
<th valign="middle" align="center">mIoU (%)</th>
<th valign="middle" align="center">F1-score (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">
<bold>FCN</bold>
</td>
<td valign="middle" align="center">
<bold>FCN-16s</bold>
</td>
<td valign="middle" align="center">99.45</td>
<td valign="middle" align="center">97.33</td>
<td valign="middle" align="center">99.79</td>
<td valign="middle" align="center">98.67</td>
<td valign="middle" align="center">97.04</td>
<td valign="middle" align="center">98.00</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">
<bold>UNet-based</bold>
</td>
<td valign="middle" align="center">
<bold>U-Net</bold>
</td>
<td valign="middle" align="center">99.53</td>
<td valign="middle" align="center">97.31</td>
<td valign="middle" align="center">
<bold>99.85</bold>
</td>
<td valign="middle" align="center">98.92</td>
<td valign="middle" align="center">87.47</td>
<td valign="middle" align="center">98.11</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>AttU-Net</bold>
</td>
<td valign="middle" align="center">99.26</td>
<td valign="middle" align="center">96.29</td>
<td valign="middle" align="center">99.74</td>
<td valign="middle" align="center">98.38</td>
<td valign="middle" align="center">96.06</td>
<td valign="middle" align="center">97.33</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>UNet++</bold>
</td>
<td valign="middle" align="center">99.43</td>
<td valign="middle" align="center">97.04</td>
<td valign="middle" align="center">99.82</td>
<td valign="middle" align="center">98.87</td>
<td valign="middle" align="center">96.95</td>
<td valign="middle" align="center">97.94</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">
<bold>MLP-based</bold>
</td>
<td valign="middle" align="center">
<bold>UNeXt</bold>
</td>
<td valign="middle" align="center">99.31</td>
<td valign="middle" align="center">96.31</td>
<td valign="middle" align="center">99.79</td>
<td valign="middle" align="center">98.67</td>
<td valign="middle" align="center">96.38</td>
<td valign="middle" align="center">97.48</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>CM-MLP</bold>
</td>
<td valign="middle" align="center">99.44</td>
<td valign="middle" align="center">97.41</td>
<td valign="middle" align="center">99.77</td>
<td valign="middle" align="center">98.54</td>
<td valign="middle" align="center">96.96</td>
<td valign="middle" align="center">97.97</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Ours</bold>
</td>
<td valign="middle" align="center">
<bold>99.53</bold>
</td>
<td valign="middle" align="center">
<bold>97.61</bold>
</td>
<td valign="middle" align="center">99.84</td>
<td valign="middle" align="center">
<bold>98.97</bold>
</td>
<td valign="middle" align="center">
<bold>97.39</bold>
</td>
<td valign="middle" align="center">
<bold>98.29</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best performance metrics in each category.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Bar chart comparison of seven models&#x2019; performance on the EBD dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g007.tif"/>
</fig>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Qualitative comparison of the proposed model compared with six models on the EBD dataset, and five examples of the predicted results are shown. From the 1<sup>st</sup> column to 9<sup>th</sup> column: the original image, the predicted results corresponding to FCN-VGG16, U-Net, AttUNet, UNet++, UNeXt, CM-MLP, our model, and the ground truth, respectively.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g008.tif"/>
</fig>
<p>Additionally, for a comprehensive evaluation of training performance, we conducted a comparative analysis of the AMS-MLP network against FCN-based, U-Net-based, and additional MLP-based models, utilizing the BSD and MLD datasets as benchmarks. The results of this comparison are presented in <xref ref-type="table" rid="T4">
<bold>Tables&#xa0;4</bold>
</xref>, <xref ref-type="table" rid="T5">
<bold>5</bold>
</xref>, and <xref ref-type="fig" rid="f9">
<bold>Figures&#xa0;9</bold>
</xref>, <xref ref-type="fig" rid="f10">
<bold>10</bold>
</xref>. Our method consistently outperformed other models across five evaluation metrics. This improvement can be attributed to the integration of GMS-MLP and LMS-MLP modules, enabling the extraction of both global and local information crucial for enhancing segmentation accuracy. The FCN-16s model&#x2019;s superior performance over U-Net is attributed to its use of pretrained VGG16, which enriches feature extraction and representation within the encoder. Similarly, the CM-MLP model, by leveraging MLP instead of traditional attention mechanisms, achieves superior segmentation results by effectively considering pixel relationships.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>The results of segmenting the BSD dataset using seven different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" colspan="2" align="center">Model</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">Recall (%)</th>
<th valign="middle" align="center">Specificity (%)</th>
<th valign="middle" align="center">Precision (%)</th>
<th valign="middle" align="center">mIoU (%)</th>
<th valign="middle" align="center">F1-score (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">
<bold>FCN</bold>
</td>
<td valign="middle" align="center">
<bold>FCN-16s</bold>
</td>
<td valign="middle" align="center">99.69</td>
<td valign="middle" align="center">97.17</td>
<td valign="middle" align="center">99.87</td>
<td valign="middle" align="center">98.11</td>
<td valign="middle" align="center">96.62</td>
<td valign="middle" align="center">97.64</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">
<bold>UNet-based</bold>
</td>
<td valign="middle" align="center">
<bold>U-Net</bold>
</td>
<td valign="middle" align="center">98.83</td>
<td valign="middle" align="center">93.85</td>
<td valign="middle" align="center">99.18</td>
<td valign="middle" align="center">88.95</td>
<td valign="middle" align="center">96.14</td>
<td valign="middle" align="center">91.33</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>AttU-Net</bold>
</td>
<td valign="middle" align="center">99.62</td>
<td valign="middle" align="center">98.05</td>
<td valign="middle" align="center">99.73</td>
<td valign="middle" align="center">96.26</td>
<td valign="middle" align="center">95.97</td>
<td valign="middle" align="center">97.14</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>UNet++</bold>
</td>
<td valign="middle" align="center">99.37</td>
<td valign="middle" align="center">
<bold>98.08</bold>
</td>
<td valign="middle" align="center">99.46</td>
<td valign="middle" align="center">92.68</td>
<td valign="middle" align="center">95.75</td>
<td valign="middle" align="center">95.31</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">
<bold>MLP-based</bold>
</td>
<td valign="middle" align="center">
<bold>UNeXt</bold>
</td>
<td valign="middle" align="center">99.37</td>
<td valign="middle" align="center">97.70</td>
<td valign="middle" align="center">99.49</td>
<td valign="middle" align="center">93.06</td>
<td valign="middle" align="center">94.66</td>
<td valign="middle" align="center">95.33</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>CM-MLP</bold>
</td>
<td valign="middle" align="center">99.66</td>
<td valign="middle" align="center">96.95</td>
<td valign="middle" align="center">99.85</td>
<td valign="middle" align="center">97.83</td>
<td valign="middle" align="center">95.68</td>
<td valign="middle" align="center">97.39</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Ours</bold>
</td>
<td valign="middle" align="center">
<bold>99.72</bold>
</td>
<td valign="middle" align="center">97.47</td>
<td valign="middle" align="center">
<bold>99.88</bold>
</td>
<td valign="middle" align="center">
<bold>98.26</bold>
</td>
<td valign="middle" align="center">
<bold>96.91</bold>
</td>
<td valign="middle" align="center">
<bold>97.86</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best performance metrics in each category.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>The results of segmenting the MLD dataset using seven different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" colspan="2" align="center">Model</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">Recall (%)</th>
<th valign="middle" align="center">Specificity (%)</th>
<th valign="middle" align="center">Precision (%)</th>
<th valign="middle" align="center">mIoU (%)</th>
<th valign="middle" align="center">F1-score (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">
<bold>FCN</bold>
</td>
<td valign="middle" align="center">
<bold>FCN-16s</bold>
</td>
<td valign="middle" align="center">99.61</td>
<td valign="middle" align="center">96.93</td>
<td valign="middle" align="center">99.89</td>
<td valign="middle" align="center">98.94</td>
<td valign="middle" align="center">97.10</td>
<td valign="middle" align="center">97.92</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">
<bold>UNet-based</bold>
</td>
<td valign="middle" align="center">
<bold>U-Net</bold>
</td>
<td valign="middle" align="center">99.46</td>
<td valign="middle" align="center">95.40</td>
<td valign="middle" align="center">99.88</td>
<td valign="middle" align="center">98.80</td>
<td valign="middle" align="center">96.19</td>
<td valign="middle" align="center">97.07</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>AttU-Net</bold>
</td>
<td valign="middle" align="center">99.57</td>
<td valign="middle" align="center">96.43</td>
<td valign="middle" align="center">99.90</td>
<td valign="middle" align="center">99.03</td>
<td valign="middle" align="center">97.05</td>
<td valign="middle" align="center">97.71</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>UNet++</bold>
</td>
<td valign="middle" align="center">98.87</td>
<td valign="middle" align="center">89.58</td>
<td valign="middle" align="center">99.83</td>
<td valign="middle" align="center">98.25</td>
<td valign="middle" align="center">92.15</td>
<td valign="middle" align="center">93.71</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">
<bold>MLP-based</bold>
</td>
<td valign="middle" align="center">
<bold>UNeXt</bold>
</td>
<td valign="middle" align="center">99.20</td>
<td valign="middle" align="center">92.84</td>
<td valign="middle" align="center">99.86</td>
<td valign="middle" align="center">98.56</td>
<td valign="middle" align="center">94.24</td>
<td valign="middle" align="center">95.61</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>CM-MLP</bold>
</td>
<td valign="middle" align="center">99.71</td>
<td valign="middle" align="center">
<bold>98.02</bold>
</td>
<td valign="middle" align="center">99.88</td>
<td valign="middle" align="center">98.85</td>
<td valign="middle" align="center">97.32</td>
<td valign="middle" align="center">98.44</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Ours</bold>
</td>
<td valign="middle" align="center">
<bold>99.72</bold>
</td>
<td valign="middle" align="center">97.79</td>
<td valign="middle" align="center">
<bold>99.92</bold>
</td>
<td valign="middle" align="center">
<bold>99.24</bold>
</td>
<td valign="middle" align="center">
<bold>97.91</bold>
</td>
<td valign="middle" align="center">
<bold>98.51</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best performance metrics in each category.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Bar chart comparison of seven models&#x2019; performance on the BSD dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g009.tif"/>
</fig>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Bar chart comparison of seven models&#x2019; performance on the MLD dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g010.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f11">
<bold>Figures&#xa0;11</bold>
</xref>, <xref ref-type="fig" rid="f12">
<bold>12</bold>
</xref> provide visual insights into segmentation outputs, revealing that U-Net and UNet++ models exhibit certain false positive regions in lesion segmentation. In contrast, AttU-Net shows improved performance over U-Net, while the AMS-MLP model closely approximates ground truth, demonstrating precise extraction of pepper leaf boundaries and reduced false positive regions. This performance superiority is facilitated by the MCRD module and the utilization of GMS-MLP and LMS-MLP auxiliary streams, which facilitate effective cascaded contraction and expansion processes within the network.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Qualitative comparison of the proposed model compared with six models on the BSD dataset, and five examples of the predicted results are shown. From the 1<sup>st</sup> column to 9<sup>th</sup> column: the original image, the predicted results corresponding to FCN-VGG16, U-Net, AttUNet, UNet++, UNeXt, CM-MLP, our model, and the ground truth, respectively.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g011.tif"/>
</fig>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Qualitative comparison of the proposed model compared with six models on the MLD dataset, and five examples of the predicted results are shown. From the 1<sup>st</sup> column to the 9<sup>th</sup> column: the original image, the predicted results corresponding to FCN-VGG16, U-Net, attention U-Net (AttUNet), UNet++, UNeXt, CM-MLP, our model, and the ground truth, respectively.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g012.tif"/>
</fig>
<p>Despite these advancements, our study acknowledges areas for further improvement, particularly in optimizing computational efficiency for real-time applications and scaling the model to larger datasets. Specifically, we recognize that, in terms of model size, our proposed AMS-MLP does not demonstrate a significant advantage compared to other models, such as CM-MLP, FCN-16s, U-Net, AttU-Net, and UNet++, as evidenced by our analysis of memory footprint and storage requirements. Nevertheless, we believe our method retains value in other critical aspects, such as potentially offering higher accuracy or efficiency in specific data processing scenarios. Future research will focus on refining architectural designs, exploring advanced training strategies, and actively working on optimizing our method to address these challenges and further elevate segmentation performance. Our ongoing efforts include strategies to reduce resource consumption and enhance overall performance, aiming to better meet practical application needs.</p>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>Ablation study</title>
<p>In this section, we conducted a comprehensive ablation study to systematically evaluate the impact of individual modules on segmentation performance using the MLD dataset. Our baseline model was derived from the BU-Net architecture, with a modified channel configuration to reduce model complexity. To assess the contribution of each module, we adopted a phased integration approach. Initially, we introduced the BAM-MLP model by incorporating the AM-MLP module into the BU-Net architecture. The AM-MLP module enhances the network&#x2019;s ability to capture global context information, which improves the focus on informative regions, thereby boosting segmentation performance. Subsequently, we integrated the MPAM module into the encoder, specifically at the fifth layer, resulting in the BMAM-MLP model. The MPAM module is crucial for generating precise masks, refining the segmentation process by better delineating object boundaries. Further, to explore the synergistic effects of incorporating multiple modules, we tested the BMRD-MLP model, which integrates both the AM-MLP and MCRD modules into the BU-Net architecture. The MCRD module enhances the network&#x2019;s ability to preserve boundary details, improving segmentation accuracy by focusing on fine-grained features. Finally, we developed the AMS-MLP model by progressively combining the AM-MLP, MPAM, and MCRD modules. This multi-module integration demonstrated superior performance across multiple evaluation metrics, illustrating the complementary effects of these modules in enhancing segmentation accuracy. Through this incremental approach, we were able to systematically assess the contribution of each module. As shown in <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref>, the results clearly demonstrate the positive impact of each module on enhancing the overall segmentation performance.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Qualitative comparison for the ablation study on the MLD dataset, and six predicted results are shown. From the 1<sup>st</sup> column to 8<sup>th</sup> column: the original image, the predicted results corresponding to BU-Net, BM-MLP, BAM-MLP, BMAM-MLP, BMRD-MLP, our model, and the ground truth, respectively.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1515105-g013.tif"/>
</fig>
<p>
<xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref> presents the detailed results of all ablation experiments. As illustrated in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>, our study commenced with the BU-Net model, which achieved the following performance metrics: 97.65% accuracy, 93.92% recall, 79.11% precision, 97.96% specificity, 97.74% mIoU, and 85.88% F1-score. To improve upon this baseline, we incorporated the AM-MLP module into the BU-Net architecture, resulting in the BAM-MLP model. A comparative analysis revealed that BAM-MLP outperformed BU-Net across all metrics, achieving 98.37% accuracy, 96.70% recall, 84.20% precision, 98.51% specificity, 98.06% mIoU, and 90.02% F1-score&#x2014;improvements of 0.72%, 2.78%, 5.09%, 0.55%, 0.32%, and 4.14%, respectively. Building on these results, we investigated the synergistic effects of adding more modules to the BAM-MLP model. We first integrated the MPAM module, leading to the BMAM-MLP model, and then included the MCRD module, resulting in the BMRD-MLP model. Both modifications brought about substantial improvements in segmentation performance, with noticeable gains in accuracy, recall, precision, specificity, and F1-score compared to the BAM-MLP model. Lastly, we constructed the AMS-MLP network by incorporating both the MPAM and MCRD modules into the BAM-MLP architecture. The MPAM and MCRD modules effectively harnessed multi-scale features, preserving boundary details and further enhancing segmentation performance. The integration of these three modules&#x2014;AM-MLP, MPAM, and MCRD&#x2014;yielded the best overall performance, as demonstrated by a thorough analysis of the combined metrics. This analysis clearly highlights the distinct contributions of each module when applied to the MLD dataset.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparison of results for the ablation experiments on pepper leaf segmentation.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">AM-MLP</th>
<th valign="middle" align="center">MPAM</th>
<th valign="middle" align="center">MCRD</th>
<th valign="middle" align="center">Accuracy (%)</th>
<th valign="middle" align="center">Recall (%)</th>
<th valign="middle" align="center">Precision (%)</th>
<th valign="middle" align="center">Specificity (%)</th>
<th valign="middle" align="center">mIoU (%)</th>
<th valign="middle" align="center">F1-score (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">
<bold>Baseline (BU-Net)</bold>
</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">97.65</td>
<td valign="middle" align="center">93.92</td>
<td valign="middle" align="center">79.11</td>
<td valign="middle" align="center">97.96</td>
<td valign="middle" align="center">97.74</td>
<td valign="middle" align="center">85.88</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>BAM-MLP</bold>
</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">98.37</td>
<td valign="middle" align="center">96.70</td>
<td valign="middle" align="center">84.20</td>
<td valign="middle" align="center">98.51</td>
<td valign="middle" align="center">98.06</td>
<td valign="middle" align="center">90.02</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>BMAM-MLP</bold>
</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">99.04</td>
<td valign="middle" align="center">96.78</td>
<td valign="middle" align="center">91.36</td>
<td valign="middle" align="center">99.25</td>
<td valign="middle" align="center">96.88</td>
<td valign="middle" align="center">93.87</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>BMRD-MLP</bold>
</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">98.90</td>
<td valign="middle" align="center">97.02</td>
<td valign="middle" align="center">92.10</td>
<td valign="middle" align="center">99.28</td>
<td valign="middle" align="center">98.18</td>
<td valign="middle" align="center">94.29</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Ours</bold>
</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">
<bold>99.63</bold>
</td>
<td valign="middle" align="center">
<bold>97.98</bold>
</td>
<td valign="middle" align="center">
<bold>97.20</bold>
</td>
<td valign="middle" align="center">
<bold>99.77</bold>
</td>
<td valign="middle" align="center">
<bold>98.28</bold>
</td>
<td valign="middle" align="center">
<bold>97.59</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best performance metrics in each category.</p>
<p>&#x2713; denotes the module is included in the model architecture for this ablation variant.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>Real-world applications for ground-based mobile disease recognition</title>
<p>The primary goal of this study is to develop a precise pepper leaf segmentation model to support ground-based mobile disease recognition systems. The proposed Adaptive Multi-Scale MLP (AMS-MLP) model is specifically designed to segment diseased leaves from images captured under various pure backgrounds (e.g., palm, ground, or desktop), which are commonly used in ground-based data collection scenarios. By accurately extracting leaves from complex backgrounds, the segmented leaves can be fed into disease recognition models, significantly improving their accuracy and robustness.</p>
<p>The AMS-MLP model is optimized for deployment on mobile devices (e.g., smartphones), making it highly accessible and practical for small-scale farmers. In real-world applications, farmers or agricultural workers can use a smartphone to capture images of pepper leaves placed on different pure backgrounds in the field. The AMS-MLP model will then precisely segment the leaves, enabling subsequent disease recognition models to perform more effectively. This approach is particularly beneficial for farmers who rely on mobile devices for crop monitoring and disease detection, as it provides a cost-effective and user-friendly solution.</p>
<p>While our current focus is on ground-based mobile applications, the AMS-MLP model can also be adapted for other platforms, such as UAVs (drones) or robotic systems, in future work. For example, integrating the model into drones could enable large-scale monitoring of pepper fields, while embedding it into agricultural robots could support automated disease detection and precision farming. However, the immediate application of our research is to enhance the performance of mobile-based disease recognition systems by improving leaf segmentation accuracy under varying backgrounds.</p>
</sec>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusion</title>
<p>Accurate extraction of plant leaves from diverse backgrounds is of significant importance for building robust plant disease recognition models. In this study, we propose a lightweight and high-precision leaf segmentation model specifically designed for extracting pepper leaves in complex and variable backgrounds. The model adopts an encoder-decoder architecture, innovatively integrating an Adaptive Multi-scale MLP (AM-MLP) network, a Multi-scale Pyramid Aggregation Module (MPAM), and a Multi-scale Context Relation Decoder (MCRD) module. In the encoder, the MPAM module enhances the accuracy of leaf edge feature extraction through cross-layer feature aggregation and single-channel masking. The AM-MLP module employs a dual-branch structure: the Global Multi-scale MLP (GMS-MLP) branch extracts global contextual features, while the Local Multi-scale MLP (LMS-MLP) branch generates local feature maps and optimizes feature representation through a dynamic attention mechanism. The decoder integrates the MCRD module, leveraging convolutional layers to improve boundary localization capabilities. The results demonstrate that the proposed method exhibits excellent robustness and generalization capabilities, achieving mean Intersection over Union (mIoU) scores of 97.39%, 96.91%, and 97.91%, as well as F1 scores of 98.29%, 97.86%, and 98.51%, on the EBD, BSD, and MLD datasets, respectively. Ablation studies further confirm that the progressive integration of the AM-MLP, MPAM, and MCRD modules significantly improves the model&#x2019;s performance across six key evaluation metrics.</p>
<p>Despite the outstanding performance of the proposed AMS-MLP network in pepper leaf segmentation tasks, certain limitations remain. First, the model relies entirely on supervised learning, requiring a large amount of precisely annotated training data. Second, there is still room for optimization in computational efficiency. Based on these observations, we outline the following future research directions: (1) exploring weakly supervised and self-supervised learning methods to reduce dependency on annotated data; (2) investigating model fine-tuning strategies to enhance generalization across different scenarios; (3) further optimizing computational efficiency to meet real-time processing requirements on ground mobile devices; (4) extending the model to platforms such as drones and robotic systems; and (5) improving network architecture design and exploring advanced training strategies to further enhance segmentation performance. These research directions will provide more efficient and versatile solutions for plant leaf segmentation in complex backgrounds.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>JF: Conceptualization, Methodology, Visualization, Writing &#x2013; original draft. HL: Conceptualization, Data curation, Methodology, Writing &#x2013; original draft. SZ: Supervision, Validation, Writing &#x2013; review &amp; editing. HH: Supervision, Validation, Writing &#x2013; review &amp; editing. HG: Data curation, Supervision, Validation, Writing &#x2013; review &amp; editing. YF: Methodology, Supervision, Validation, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s9" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. The research described in this paper was funded by the National Natural Science Foundation of China (No. 61966001, No. 62206195, No. 61866001, No. 62163004, No. 61963002, and No. 62206195), the Joint Funds of the Zhejiang Provincial Natural Science Foundation of China (No. LZY23F050001), Natural Science Foundation of Jiangxi Province (No. 20202BABL214032 and No. 20202BABL203035), Science and Technology Plan Project of Taizhou City (No. 22ywa58 and No. 22nya18), Jiangxi Engineering Laboratory on Radioactive Geoscience and Big Data Technology (No. JELRGBDT202201), the Engineering Research Center of Development and Management for Low to Ultra-Low Permeability Oil &amp; Gas Reservoirs in West China (No. KFJJ-XB-2020-1), and the Open Fund of Key Laboratory of Exploration Technologies for Oil and Gas Resources (No. K2021-02).</p>
</sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alom</surname> <given-names>M. Z.</given-names>
</name>
<name>
<surname>Hasan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yakopcic</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Taha</surname> <given-names>T. M.</given-names>
</name>
<name>
<surname>Asari</surname> <given-names>V. K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Recurrent residual convolutional neural network based on U-Net (R2U-Net) for medical image segmentation</article-title>. <source>arXiv preprint arXiv:1802.06955</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1802.06955</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beikmohammadi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Faez</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Motallebi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>SWP-LeafNET: A novel multistage approach for plant leaf identification based on deep CNN</article-title>. <source>Expert Syst. Appl.</source> <volume>202</volume>, <fpage>117470</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2022.117470</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bhavini</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sheshang</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A Survey on apple fruit diseases detection and classification</article-title>. <source>Int. J. Comput. Appl.</source> <volume>130</volume>, <fpage>25</fpage>&#x2013;<lpage>32</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5120/IJCA2015907153</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Adeli</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>TransUNet: Transformers make strong encoders for medical image segmentation</article-title>. <source>arXiv preprint arXiv:2102.04306</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2102.04306</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cruz</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Ampatzidis</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Pierro</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Materazzi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Panattoni</surname> <given-names>A.</given-names>
</name>
<name>
<surname>De Bellis</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Detection of grapevine yellows symptoms in Vitis vinifera L. with artificial intelligence</article-title>. <source>Comput. Electron. Agric.</source> <volume>157</volume>, <fpage>63</fpage>&#x2013;<lpage>76</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.12.028</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>AISOA-SSformer: An effective image segmentation method for rice leaf disease based on the transformer architecture</article-title>. <source>Plant Phenomics</source> <volume>6</volume>, <fpage>0218</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.34133/plantphenomics.0218</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deb</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Garai</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Das</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Dhal</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>LS-Net: a convolutional neural network for leaf segmentation of rosette plants</article-title>. <source>Neural Comput. Applic.</source> <volume>34</volume>, <fpage>18511</fpage>&#x2013;<lpage>18524</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00521-022-07479-9</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Devlin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>M.-W.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Toutanova</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>BERT: Pre-training of deep bidirectional Transformers for language understanding</article-title>. <source>arXiv preprint arXiv: 1810.04805</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ding</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>RepMLPNet: Hierarchical vision MLP with re-parameterized locality</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <fpage>568</fpage>&#x2013;<lpage>577</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. <source>arXiv preprint arXiv:2010.11929</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>G. P.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>H. Z.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>J. B.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>PraNet: Parallel reverse attention network for polyp segmentation</article-title>. <source>arXiv preprint arXiv: 2006.11392</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2006.11392</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>X. D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>BAF-Net: Bidirectional attention fusion network via CNN and transformers for the pepper leaf segmentation</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>, <elocation-id>1123410</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1123410</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>b). <article-title>Fuzzy region-based active contour driven by global and local fitting energy for image segmentation</article-title>. <source>Appl. Soft Comp.</source> <volume>100</volume>, <fpage>106982</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.asoc.2020.106982</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>a). <article-title>Region-edge-based active contours driven by hybrid and local fuzzy region-based energy for image segmentation</article-title>. <source>Inf. Sci.</source> <volume>546</volume>, <fpage>397</fpage>&#x2013;<lpage>419</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ins.2020.08.078</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fatima Naqvi</surname> <given-names>S. A.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>M. A.</given-names>
</name>
<name>
<surname>Hamza</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Alsenan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Alharbi</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Teng</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Fruit and vegetable leaf disease recognition based on a novel custom convolutional neural network and shallow classifier</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>, <elocation-id>1469685</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1469685</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A lightweight CNN model for pepper leaf disease recognition in a human palm background</article-title>. <source>Heliyon</source> <volume>10</volume>, <elocation-id>e33447</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e33447</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A novel ensemble learning method for crop leaf disease recognition</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>, <elocation-id>1280671</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1280671</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Squeeze-and-excitation networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>7132</fpage>&#x2013;<lpage>7141</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jothiaruna</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Joseph Abraham Sundar</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Ifjaz Ahmed</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A disease spot segmentation method using comprehensive color feature with multi-resolution channel and region growing</article-title>. <source>Multimedia Tools Appl.</source> <volume>80</volume>, <fpage>3327</fpage>&#x2013;<lpage>3335</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-020-09882-7</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kalaivani</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Shantharajah</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Padma</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Agricultural leaf blight disease segmentation using indices based histogram intensity segmentation approach</article-title>. <source>Multimedia Tools Appl.</source> <volume>79</volume>, <fpage>9145</fpage>&#x2013;<lpage>9159</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-018-7126-7</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Kang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Attention-guided convolutional neural network for detecting pneumonia on chest x-rays</article-title>,&#x201d; in <conf-name>2019 41st Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)</conf-name>. <fpage>4851</fpage>&#x2013;<lpage>4854</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>P.</given-names>
</name>
<name>
<surname>An</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Pyramid attention network for semantic segmentation</article-title>. <source>arXiv preprint arXiv: 1805.10180</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1805.10180</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>B. L.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Research on the segmentation method of rice leaf disease image</article-title>. <source>Appl. Mech. Mater.</source> <volume>220&#x2013;223</volume>, <fpage>1339</fpage>&#x2013;<lpage>1344</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.4028/www.scientific.net/AMM.220-223.1339</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Identification of apple leaf diseases based on deep convolutional neural networks</article-title>. <source>Symmetry</source> <volume>10</volume>, <fpage>11</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/sym10010011</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Long</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shelhamer</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Darrell</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Fully convolutional networks for semantic segmentation</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>3431</fpage>&#x2013;<lpage>3440</lpage>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>CM-MLP: Cascade multi-scale MLP with axial context relation encoder for edge segmentation of medical image</article-title>. <source>arXiv preprint arXiv: 2208.10701</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2208.10701</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Melas-Kyriazi</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Do you even need attention? A stack of feed-forward layers does surprisingly well on ImageNet</article-title>. <source>arXiv preprint arXiv: 2105.02723</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2105.02723</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Naik</surname> <given-names>B. N.</given-names>
</name>
<name>
<surname>Malmathanraj</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Palanisamy</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Detection and classification of chilli leaf disease using a squeeze-and-excitation-based CNN model</article-title>. <source>Ecol. Inform.</source> <volume>69</volume>, <fpage>101663</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2022.101663</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ngugi</surname> <given-names>L. C.</given-names>
</name>
<name>
<surname>Abdelwahab</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Abo-Zahhad</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A new approach to learning and recognizing leaf diseases from individual lesions using convolutional neural networks</article-title>. <source>Inf. Process. Agric.</source> <volume>10</volume>, <fpage>11</fpage>&#x2013;<lpage>27</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.inpa.2021.10.004</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oktay</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Schlemper</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Folgoc</surname> <given-names>L. L.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Heinrich</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Misawa</surname> <given-names>K.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Attention U-Net: learning where to look for the pancreas</article-title>. <source>arXiv preprint arXiv: 1804.03999</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1804.03999</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pal</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>V.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>AgriDet: Plant leaf disease severity classification using agriculture detection framework</article-title>. <source>Eng. Appl. Artif. Intel.</source> <volume>119</volume>, <fpage>105754</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2022.105754</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-Net: convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <source>Medical Image Computing and Computer-Assisted Intervention - MICCAI 2015 (Lecture Notes in Computer Science)</source>. (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>) <volume>9351</volume>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Recurrent mask refinement for few-shot medical image segmentation</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>. <fpage>3918</fpage>&#x2013;<lpage>3928</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Evans</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Segmentation of tomato leaf images based on adaptive clustering number of K-means algorithm</article-title>. <source>Comput. Electron. Agric.</source> <volume>165</volume>, <fpage>104962</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.104962</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tolstikhin</surname> <given-names>I. O.</given-names>
</name>
<name>
<surname>Houlsby</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>MLP-Mixer: An all-mlp architecture for vision</article-title>. <source>arXiv preprint arXiv: 2105.01601</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2105.01601</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Talebi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Milanfar</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Bovik</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). &#x201c;<article-title>Maxim: Multi-axis mlp for image processing</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-date>June 2022</conf-date>. <fpage>5769</fpage>&#x2013;<lpage>5780</lpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Valanarasu</surname> <given-names>J. M. J.</given-names>
</name>
<name>
<surname>Patel</surname> <given-names>V. M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>UNeXt: MLP-based rapid medical image segmentation network</article-title>. <source>arXiv preprint arXiv: 2203.04967</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2203.04967</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>BiO-Net: Learning recurrent bi-directional connections for encoder-decoder architecture</article-title>. <source>arXiv preprint arXiv: 2007.00243</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2007.00243</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiong</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>The extraction algorithm of color disease spot image based on Otsu and watershed</article-title>. <source>Soft Comput.</source> <volume>24</volume>, <fpage>7253</fpage>&#x2013;<lpage>7263</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00500-019-04339-y</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Spontaneous speech emotion recognition using multiscale deep convolutional LSTM</article-title>. <source>IEEE T. Affect. Comput.</source> <volume>13</volume>, <fpage>680</fpage>&#x2013;<lpage>688</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2019.2947464</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Siddiquee</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Tajbakhsh</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>UNet++: redesigning skip connections to exploit multiscale features in image segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>39</volume>, <fpage>1856</fpage>&#x2013;<lpage>1867</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2019.2959609</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>