<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2023.1111175</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Cotton leaf segmentation with composite backbone architecture combining convolution and attention</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Yan</surname>
<given-names>Jingkun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2085876"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yan</surname>
<given-names>Tianying</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1074365"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ye</surname>
<given-names>Weixin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lv</surname>
<given-names>Xin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1979267"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Gao</surname>
<given-names>Pan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1081873"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Wei</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Information Science and Technology, Shihezi University</institution>, <addr-line>Shihezi</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>National-Local Joint Engineering Research Center for Agricultural Big Data, Xinjiang Production and Construction Group</institution>, <addr-line>Shihezi</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>College of Agriculture, Shihezi University</institution>, <addr-line>Shihezi</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Zhanyou Xu, Agricultural Research Service (USDA), United States</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Wenzheng Bao, Xuzhou University of Technology, China; Nisha Pillai, Mississippi State University, United States</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Pan Gao, <email xlink:href="mailto:gp_inf@shzu.edu.cn">gp_inf@shzu.edu.cn</email>;  Wei Xu, <email xlink:href="mailto:xuwei0412@shzu.edu.cn">xuwei0412@shzu.edu.cn</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work and share first authorship</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to Technical Advances in Plant Science, a section of the journal Frontiers in Plant Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>31</day>
<month>01</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1111175</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>11</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>01</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Yan, Yan, Ye, Lv, Gao and Xu</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Yan, Yan, Ye, Lv, Gao and Xu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Plant leaf segmentation, especially leaf edge accurate recognition, is the data support for automatically measuring plant phenotypic parameters. However, adjusting the backbone in the current cutting-edge segmentation model for cotton leaf segmentation applications requires various trial and error costs (e.g., expert experience and computing costs). Thus, a simple and effective semantic segmentation architecture (our model) based on the composite backbone was proposed, considering the computational requirements of the mainstream Transformer backbone integrating attention mechanism. The composite backbone was composed of CoAtNet and Xception. CoAtNet integrated the attention mechanism of the Transformers into the convolution operation. The experimental results showed that our model outperformed the benchmark segmentation models PSPNet, DANet, CPNet, and DeepLab v3+ on the cotton leaf dataset, especially on the leaf edge segmentation (MIoU: 0.940, BIoU: 0.608). The composite backbone of our model integrated the convolution of the convolutional neural networks and the attention of the Transformers, which alleviated the computing power requirements of the Transformers under excellent performance. Our model reduces the trial and error cost of adjusting the segmentation model architecture for specific agricultural applications and provides a potential scheme for high-throughput phenotypic feature detection of plants.</p>
</abstract>
<kwd-group>
<kwd>cotton leaf segmentation</kwd>
<kwd>composite backbone</kwd>
<kwd>convolutional neural network</kwd>
<kwd>attention mechanism</kwd>
<kwd>transformer</kwd>
</kwd-group>
<counts>
<fig-count count="9"/>
<table-count count="4"/>
<equation-count count="6"/>
<ref-count count="46"/>
<page-count count="12"/>
<word-count count="6410"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Cotton, the second largest crop after grain, is the primary raw material for daily necessities and the textile industry (<xref ref-type="bibr" rid="B12">Feng et&#xa0;al., 2022</xref>). However, biotic stress and abiotic stress existing in cotton production affect the yield and quality (<xref ref-type="bibr" rid="B44">Zhang et&#xa0;al., 2022</xref>). To ensure sustainable cotton production, breeders must identify quality varieties through continuous monitoring of cotton phenotypic traits (<xref ref-type="bibr" rid="B41">Ye, 2014</xref>). Budding, flowering, and boll periods are significant growth stages of cotton, which are directly reflected in cotton leaves due to the influence of nutrition, diseases, and insect pests, and thus determine the subsequent growth and yield of cotton (<xref ref-type="bibr" rid="B29">Mubarik et&#xa0;al., 2020</xref>). Breeders screen the appropriate cotton varieties during the budding, flowering, and boll period, based on estimates of plant disease resistance and yield reflected by closely related leaf phenotypic traits (e.g., Leaf Length, Leaf Area Index) (<xref ref-type="bibr" rid="B31">Saeed et&#xa0;al., 2021</xref>). Manual sampling in complex field environments is a natural way to measure cotton leaf phenotypic parameters. However, manual sampling is a labor-intensive, time-consuming, and disruptive process (<xref ref-type="bibr" rid="B2">Bao et&#xa0;al., 2021</xref>). Image segmentation in computer vision is a standard approach for non-destructive sampling in complex field environments. The image segmentation algorithm can automatically separate the samples to be processed. Therefore, image segmentation has gradually become a potential preprocessing approach of sample separation for rapidly measuring plant phenotypic parameters.</p>
<p>With advances in computing power (e.g., GPU), deep learning with powerful nonlinear and robust generalization ability replaces the traditional image segmentation algorithm, which highly relies on expert experience (<xref ref-type="bibr" rid="B34">Taghanaki et&#xa0;al., 2020</xref>). Generally speaking, the segmentation models based on deep learning are composed of encoders and decoders, such as PSPNet (<xref ref-type="bibr" rid="B45">Zhao et&#xa0;al., 2017</xref>), DANet (<xref ref-type="bibr" rid="B13">Fu et&#xa0;al., 2019</xref>), CPNet (<xref ref-type="bibr" rid="B43">Yu et&#xa0;al., 2020</xref>), DeepLab v3+ (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2018</xref>). Specifically, the backbones of the segmentation models in the encoder are used to extract features (<xref ref-type="bibr" rid="B27">Miao et&#xa0;al., 2020</xref>). The feature diversity of backbone extraction determines the performance of the segmentation model (<xref ref-type="bibr" rid="B28">Minaee et&#xa0;al., 2022</xref>). Currently, convolutional neural networks (CNNs, e.g., ResNet-101, Xception) with deep stacked convolution structures to represent powerful features have gradually become mainstream feature extractors. PSPNet utilizes ResNet-101 as a backbone to achieve an elegant expression in the complex field environment of grape segmentation (<xref ref-type="bibr" rid="B6">Chen et&#xa0;al., 2021</xref>). DeepLab v3+ employs ResNet-101/Xception as a backbone to segment fruit plaques (<xref ref-type="bibr" rid="B25">Li et&#xa0;al., 2022b</xref>; <xref ref-type="bibr" rid="B42">Yuan et&#xa0;al., 2022</xref>), and also attempts to segment cotton roots (<xref ref-type="bibr" rid="B16">Kang et&#xa0;al., 2021</xref>).</p>
<p>CNNs have been widely used in plant phenotype, especially phenotype segmentation. However, CNNs have apparent disadvantages, such as poor learning ability of low-level features of images and partial neglect of global information, which limit the accurate segmentation of object edges in complex field environments (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2018</xref>). Due to the complexity of the leaf environment, the morphological characteristics (texture, size, and shape) of the leaf change accordingly, and the segmentation of the leaf edge has the dilemma of over-segmentation/under-segmentation (<xref ref-type="bibr" rid="B39">Yang et&#xa0;al., 2020</xref>). Transformers, as attention models, achieve powerful accuracy for large-scale datasets with a robust representation of global context (<xref ref-type="bibr" rid="B10">Dosovitskiy et&#xa0;al., 2021</xref>). In contrast, CNNs with deep stacked convolution structures embedded with attention modules, e.g., Channel Attention Module (<xref ref-type="bibr" rid="B37">Woo et&#xa0;al., 2018</xref>), and Convolutional Block Attention Module (<xref ref-type="bibr" rid="B37">Woo et&#xa0;al., 2018</xref>), integrate global information to a limited extent, and slightly improve the power of object edge segmentation. Thus, with the success of self-attention models such as Transformers, much previous work has attempted to bring the power of attention to computer vision (<xref ref-type="bibr" rid="B17">Khan et&#xa0;al., 2022</xref>).</p>
<p>Recently, Transformer-based backbones have shown potential performance and expanded cutting-edge applications. <xref ref-type="bibr" rid="B20">Li et&#xa0;al. (2022a)</xref> proposed an automatic pest recognition method based on Vision Transformer (ViT) in PlantVillage (a public dataset of plant pests and diseases) (<xref ref-type="bibr" rid="B15">Hughes and Salath&#xe9;, 2015</xref>). <xref ref-type="bibr" rid="B30">Reedha et&#xa0;al. (2022)</xref> proposed a novel crop recognition model using ViT based on unmanned aerial vehicles (UAV) remote sensing images. <xref ref-type="bibr" rid="B38">Wu et&#xa0;al. (2021)</xref> proposed a multi-scale feature extraction model based on a visual converter to identify crop disease types. However, the large model capacity with huge parameters and high computational power required by Transformers hinders rapid application to agricultural tasks (<xref ref-type="bibr" rid="B17">Khan et&#xa0;al., 2022</xref>). The attention of Transformers has slight inductive bias and weak generalization on the relatively small amount of datasets compared with the convolution of CNNs (<xref ref-type="bibr" rid="B10">Dosovitskiy et&#xa0;al., 2021</xref>).</p>
<p>In relatively small agricultural data sets, plant phenotype researchers have used the Transformer and CNN cascade model, incorporating the inductive bias of CNNs and the self-attention mechanism of Transformers, to study plant phenotype. <xref ref-type="bibr" rid="B36">Wang et&#xa0;al. (2022)</xref> proposed a crop segmentation method of remote sensing images based on a barley remote sensing dataset by constructing a novel architecture of coupling CNN and Transformer. <xref ref-type="bibr" rid="B22">Liu et&#xa0;al. (2022)</xref> attempted to propose a CNN-Transformer network with Multi-Scale Context Aggregation (MSCANet) and realize efficient and effective farmland change detection. However, while Transformer and CNN cascade models integrate the respective advantages of Transformers and CNNs, the computational cost and data requirements of Transformers are also introduced into the cascade models, which hinders the rapid promotion of the cascade models in agriculture. Therefore, for the global learning potential of the self-attention mechanism of Transformers and the fast application limitation of Transformers required computing power and large-scale datasets, the models combining convolution of CNNs and self-attention of Transformers have become a new research direction. CoAtNet (<xref ref-type="bibr" rid="B8">Dai et&#xa0;al., 2021</xref>), as a novel backbone, incorporates the global awareness of Transformers and the inductive bias of CNNs.</p>
<p>Different from Transformer and CNN Cascade Models, CoAtNet introduces CNN convolution and Transformer attention to alleviate computational power greed. The classification speed and accuracy of CoAtNet in ImageNet demonstrate the potential of CoAtNet as a backbone for segmentation models. However, the robust backbone design of the segmentation models requires substantial trial-and-error costs (e.g., expert experience and computational costs). As the backbone architecture of automatic search, neural architecture search (NAS) (<xref ref-type="bibr" rid="B46">Zoph and Le, 2017</xref>) still has the computational cost of architecture search. Therefore, for backbone design, simple and effective strategies are urgently needed for rapid application in agriculture. CBNet (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2020</xref>) and CBNetV2 (<xref ref-type="bibr" rid="B19">Liang et&#xa0;al., 2021</xref>) proposed the architectures integrating multiple backbones into a composite backbone for object detection, which assembles multiple existing backbones in parallel to represent various features, reducing the computational cost of architecture design. Inspired by CBNet and CBNetV2, a leaf segmentation architecture based on composite backbone architecture was proposed and explored.</p>
<p>To the best of our knowledge, the encoder-decoder architecture segmentation model has over-segmentation and under-segmentation in complex field environments. Among the encoders of the segmentation models, the design of a robust backbone can alleviate segmentation anomalies, especially the mainstream CNNs and Transformers. CNNs are highlighted by inductive learning and generalization, while Transformers are highlighted by global semantics. However, both Transformers and the design of cotton-leaf segmentation architectures are computationally expensive. Therefore, this work aims to explore the application of the composite backbone architecture combined with the convolution of CNNs and the attention of Transformers in cotton leaf segmentation without significantly introducing the computational power requirements of Transformers. The specific objectives achieved herein are as follows:</p>
<list list-type="simple">
<list-item>
<p>(1) Eight hundred images of budding, flowering, and boll period cotton leaves in five typical complex field environments (normal, spotted lesions, regional lesions, occluded blades, uneven illumination) were collected and labeled.</p>
</list-item>
<list-item>
<p>(2) CoAtNet, which incorporates the attention mechanism of Transformers into the convolution, was explored as the backbone of the encoder in the cotton leaf segmentation architecture.</p>
</list-item>
<list-item>
<p>(3) A simple and effective composite backbone (Xception and CoAtNet) leaf segmentation architecture combining convolution and attention was designed to fully learn the edge information and global context of cotton leaves.</p>
</list-item>
</list>
<p>An outline is employed to show the detailed steps of this work in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. Our model is based on the encoder-decoder architecture of DeepLab v3+, and the composite backbone is introduced into our model. In step 1, Xception and CoAtNet are used as the lead backbone and assisting backbone in the composite backbone, and the features of the input image are first extracted by assisting backbone. In step 2, the output features of each stage of the assisting backbone flow to parallel and lower stages of the lead backbone. Xception learns the richer multi-level features of the assisting backbone. In step 3, the fusion mechanism of weight contribution factors is adopted to suppress unimportant features from different backbones. The fused features flow to the lead backbone under the batch-normalized channel weight contribution factor. Finally, the output of the composite backbone is applied to the encoder and decoder.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The outline of the composite backbone and segmentation architecture in our model. In step 1, the composite backbone (Xception and CoAtNet) is selected, and the features of the input image are extracted by CoAtNet. In step 2, multi-scale features are interacted in the composite backbone. In step 3, features from the composite backbone are fused using weight contribution factors.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g001.tif"/>
</fig>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<p>In this section, Section 2.1 introduced the subdivision of the cotton dataset into acquisition and preprocessing. Then, Section 2.2 illustrated the design of the segmentation model, including the model framework and the composite backbone. Finally, Section 2.3 introduced the experimental details, including the experimental structure, training, and testing strategy.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Data description</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Acquisition</title>
<p>Cotton crops were grown in the field at the experimental station (85&#xb0;9&#x2032;51.231 00&#x2032;&#x2032;E, 44&#xb0;35&#x2032;47.720 00&#x2032;&#x2032;N) of the Agricultural College of Shihezi University, Shihezi, China. The cotton variety &#x201c;Xinluzao 54&#x201d; was trial-planted on April 7, 2021, and the sowing density was ten seeds/square meter. Specifically, the column spacing was 0.2&#xa0;m, and the row spacing was 0.3&#xa0;m. The images were acquired along the rows over the entire field on six experimental dates in the budding, flowering and boll period (June 11, June 18, June 23, July 7, July 13, and July 22). Multiple smartphones were selected to capture images and verify the generality of the subsequent segmentation models. The smartphone cameras were set to manual operation mode, with a distance of about 0.3&#xa0;m from the target leaves. Specifically, the target leaves were photographed in natural light (9:00-12:00 a.m., Beijing Time). The following five types of cotton leaves were typical research objects, as shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>.</p>
<list list-type="bullet">
<list-item>
<p>Normal leaves;</p>
</list-item>
<list-item>
<p>Leaves with spotted lesions;</p>
</list-item>
<list-item>
<p>Leaves with regional lesions;</p>
</list-item>
<list-item>
<p>Leaves with occluded blades;</p>
</list-item>
<list-item>
<p>Leaves with uneven illumination.</p>
</list-item>
</list>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Images of the Cotton Leaf dataset. The dataset is divided into five representative leaves: <bold>(A)</bold> a normal cotton leaf, <bold>(B)</bold> a cotton leaf with spotted lesions, <bold>(C)</bold> a cotton leaf with regional lesions, <bold>(D)</bold> a cotton leaf with occluded blades, and <bold>(E)</bold> a cotton leaf with uneven illumination.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g002.tif"/>
</fig>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Preprocess</title>
<p>The median filtering algorithm was applied to image preprocessing since a certain amount of image noise caused by external factors would negatively impact the training of segmentation models. Moreover, the image resolution was adjusted to 512&#xd7;512 pixels before annotation, saving computational resources and labor handling time. Subsequently, the polygons pattern in Labelme-3.3.6 (<xref ref-type="bibr" rid="B35">Torralba et&#xa0;al., 2010</xref>) provided labels for two semantic classes of the dataset, including foreground (target leaves) and background (i.e., soil, weeds, other leaves). The image annotation process is shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. The diversity of leaf images under different growth periods was considered, and at least 100 images were labeled from five typical cotton leaves in the budding, flowering, and boll period.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Image annotation process. The left is the input image, and the right is the labeled image.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g003.tif"/>
</fig>
<p>The size and diversity of the dataset affect the segmentation model performance (<xref ref-type="bibr" rid="B3">Barbedo, 2018</xref>). Specifically, large-scale datasets are a prerequisite for building reliable segmentation models, while limited datasets easily lead to model overfitting. Therefore, a series of operations was adopted to expand the cotton leaf dataset: rotation and mirror flip. The final cotton leaf dataset containing 800 images and segmentation labels was divided into 80% training dataset and 20% testing dataset for training and testing subsequent segmentation models.</p>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Model design</title>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>Framework</title>
<p>Currently, high-performing segmentation models rely heavily on the backbones. Intuitively, the rich feature maps extracted by the backbones and the vast receptive fields sensed by the backbones determine the segmentation model performance (<xref ref-type="bibr" rid="B26">Ma et&#xa0;al., 2020</xref>). However, designing and pre-training a new backbone consumes various computing resources, and requires a large number of training samples (<xref ref-type="bibr" rid="B1">Bao et&#xa0;al., 2022</xref>). Recently, the application of composite backbone in object detection has inspired our model (<xref ref-type="bibr" rid="B19">Liang et&#xa0;al., 2021</xref>). A composite backbone combines several existing networks and then integrates the rich features of multiple scales. In addition, previous studies have shown that the feature pyramid network (FPN) is more effective than simple network deepening or broadening. Top-down paths of FPN introduce spatially richer and semantically more powerful high-level features and enhance low-level features in bottom-up paths of FPN. Thus, in our model, multiple backbones are composited and called assisting backbone and lead backbone, respectively. The composite backbone of our model extends the FPN (<xref ref-type="bibr" rid="B21">Lin et&#xa0;al., 2017</xref>) idea and combines high-level and low-level features from multiple networks.</p>
<p>As a classical semantic segmentation model, DeepLab v3+ (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2018</xref>) is used as the benchmark for segmentation models. Therefore, DeepLab v3+ is regarded as the prototype of our model, and the lead backbone is the Xception applicable to segmentation in the raw DeepLab v3+. However, DeepLab v3+ still does not fully show excellent potential performance and only tries mature convolutional neural networks (CNN) as a backbone. As a backbone, a simple CNN has the problems of missing global information and tiny local receptive fields, which cannot meet the requirements of DeepLab v3+ for feature maps. In addition, CoAtNet (<xref ref-type="bibr" rid="B8">Dai et&#xa0;al., 2021</xref>) integrates the attention mechanism of Transformers into the convolution operation of CNN, maintaining the optimal tradeoff between model generalization capability and model capacity. Therefore, the hybrid family of CoAtNet is used as the assisting backbone of our model (based on DeepLab v3+).</p>
<p>As shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>, our model is based on the encoder-decoder architecture of DeepLab v3+. Our model uses Xception (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2018</xref>) and CoAtNet (<xref ref-type="bibr" rid="B8">Dai et&#xa0;al., 2021</xref>) as the lead backbone and assisting backbone. In addition, our model is inspired by FPN and contains long-skip connections from the encoding path to the decoding path and short-skip connections between the composite backbone. Long-skip connections transmit low-level features and high-level features. Short-skip connections fuse assisting backbone and lead backbone features, and transmit to the lead backbone.</p>
<p>The remaining parts retain the original architecture of DeepLab v3+. The atrous spatial pyramid pooling (<xref ref-type="bibr" rid="B5">Chen et&#xa0;al., 2017</xref>) module of the encoder processes the lead backbone output features with five different operations, namely 1&#xd7;1 convolution, 3&#xd7;3 convolution at dilation rate 6, 3&#xd7;3 convolution at dilation rate 12, 3&#xd7;3 convolution at dilation rate 18, and Image Pooling. The output features of five different operations are downsampled to 1/16 of the input image size and then combined to form multi-scale features. The multi-scale features are then subjected to 1&#xd7;1 convolution operation to form high-level features. The low-level features output by the assisting backbone A1 are combined and fused with the high-level features four times up-sampled after the 1&#xd7;1 convolution operation. The low and high-level fusion features are restored to the input image size by 3&#xd7;3 convolution and four times upsampling. In our model, two dropout layers are added before the last four times upsampling layers to avoid overfitting. The softmax function finally activates our model. Each channel value of the activation output represents the category probability, and the maximum probability value determines the pixel category.</p>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Backbone</title>
<p>Our model is based on CoAtNet and Xception as the composite backbone. As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, the official Xception backbone for segmentation is retained as the lead backbone. In our model, the lead backbone and assisting backbone are divided into five standard blocks, which are L0, L1, L2, L3, and L4 of the lead backbone, and A0, A1, A2, A3, and A4 of the assisting backbone in turn. Concretely, our model divides Xception into five modules, L0, L1, L2, L3, and L4, according to the remaining residual connection after the first residual connection. Modules L0, L1, L2, L3, and L4 are composed of only 3&#xd7;3 separable convolution to reduce computational power requirements. The L3 module is repeated 16 times to learn the image features fully. The rest consists of 3&#xd7;3 convolution and 3&#xd7;3 separable convolution. 1&#xd7;1 convolution achieves feature channel rise and residual transfer. In Xception, the number of channels of the feature map increases successively, and the partial convolution step is set to 2 to fully capture the spatial information of the feature map and reduce the spatial resolution.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Xception. C represents the number of feature output channels.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g004.tif"/>
</fig>
<p>As shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, in our model, the assisting backbone consists of three convolution modules, A0, A1, and A2, and two self-attention modules, A3 and A4. The A0 module consists only of 3&#xd7;3 convolution, which reduces the feature spatial resolution. Modules A1 and A2 are expanded by the attention mechanism of MobileNet consisting of 1&#xd7;1 convolution and 3&#xd7;3 separable convolution (MBConv module with inverted bottleneck structure) (<xref ref-type="bibr" rid="B32">Sandler et&#xa0;al., 2018</xref>). 1&#xd7;1 convolution is used to increase and reduce the dimension of the feature. A3 and A4 modules contain a Relative-Attention (Rel-Attention) layer and a Feed-Forward Network (FFN) layer for learning global feature information. The modules A1, A2, A3, and A4, are successively repeated 2, 4, 8, and 2 times to explore the features fully. The rest consists of global pooling and a fully connected (FC) layer. The residual connection is guaranteed to reduce the model complexity to reduce overfitting, while the residual connection prevents the gradient from disappearing. Specifically, 1&#xd7;1 convolution carries out feature channel dimension raising and completes the residual transfer.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>CoAtNet. CoAtNet is divided into five modules: three convolution modules, A0, A1, A2, and two self-attention modules, A3 and A4. C represents the number of feature output channels. E is the n-time expansion rate of the Feed-Forward Network (FFN) layer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g005.tif"/>
</fig>
<p>The Rel-Attention layer expands the attention mechanism of Transformers. The Rel-Attention layer stretches the input features from three-dimensional to two-dimensional, that is, h&#xd7;w&#xd7;c to (h&#xd7;w)&#xd7;c, and then gets the Input Embeddings. The trainable weight matrices of Queries, Keys, and Values are calculated by the Input Embeddings with the full connection. Intuitively, the two-dimensional matrix Queries, Keys, and Values all contain feature global information. The Score matrix is computed by the scalar product of Queries and Keys. The Score matrix represents the correlation between each one-dimensional vector in Keys and each one-dimensional vector in Queries. Further, the Score matrix is scaled and activated by the softmax function. Then, the Attention Matrix is obtained by calculating the scalar product between the Score matrix and Values, which contain relative global attention features of each one-dimensional vector in the three matrices of Queries, Keys, and Values. Finally, the Attention Matrix is reconverted into three dimensions to obtain the output features.</p>
<p>The FFN layer learns advanced image features from the MBConv block. The Input Embeddings are expanded by an FFN layer consisting of multiple FC layers with an n-time expansion rate and then resized to the original size. In our model, the expansion rate of the feature channels in the FFN layer was set to 4.</p>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Composite</title>
<p>Backbone, or feature extractor, as the initial stage of the semantic segmentation network, plays a significant role in model segmentation performance (<xref ref-type="bibr" rid="B11">Fan et&#xa0;al., 2018</xref>). Backbone provides the basic features of the segmentation target for the semantic segmentation model. Our model draws on the ideas of FPN (<xref ref-type="bibr" rid="B21">Lin et&#xa0;al., 2017</xref>) and CBNetV2 (<xref ref-type="bibr" rid="B19">Liang et&#xa0;al., 2021</xref>) architecture to construct the connection structure between the lead backbone and the assisting backbone. As shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, the output features of modules A0, A1, A2, A3, and A4 of CoAtNet flow to parallel and lower-level jump connections of Xception. Xception both preserves the original residual connection and learns the richer multi-level features of the assisting backbone. Specifically, the output feature maps of modules A0, A1, A2, A3, and A4 are consistent with the dimension of the output feature maps of Xception and skip-connections of lower stages by 1&#xd7;1 convolution. Subsequently, linear interpolation keeps the output feature maps of A0, A1, A2, A3, and A4 modules consistent with the spatial resolution of the output feature maps at parallel and lower skip-connections of Xception. Finally, the output feature maps of modules A0, A1, A2, A3, and A4 are element-summed with the output feature maps at parallel and lower-level skip-connections of Xception.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Our composite backbone architecture with CoAtNet as assisting backbone.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g006.tif"/>
</fig>
<p>The output of each stage of the assisting backbone flows to parallel and lower stages of the lead backbone. The output of the lead backbone is applied to downstream tasks. Different from simple network deepening or broadening, the composite backbone, which integrates high- and low-level features, gradually expands the receptive field and provides richer target information. Due to the different response values of the multi-level features integrated in the composite backbone, the model is prone to convergence dilemmas. Inspired by the accelerated convergence of normalization (<xref ref-type="bibr" rid="B40">Yan et&#xa0;al., 2020</xref>), our model adopts the fusion mechanism of weight contribution factors to suppress unimportant features, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>. The fused features flow to the lead backbone of Xception under the batch-normalized channel weight contribution factor.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Fusion mechanism of weight contribution factors based on batch normalization. Where, <italic>&#x3b3;</italic>
<sub>
<italic>i</italic>
</sub> represents the weight value of the i-th channel calculated in batch normalization, <italic>&#x3b3;</italic>
<sub>
<italic>j</italic>
</sub> represents the weight value of the j-th channel calculated in batch normalization, <italic>&#x3c9;</italic>
<sub>
<italic>i</italic>
</sub> represents the importance degree of the i-th channel.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g007.tif"/>
</fig>
</sec>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Experiment</title>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>Experimental detail</title>
<sec id="s2_3_1_1">
<label>2.3.1.1</label>
<title>Hardware</title>
<p>Experiments were conducted with the following hardware configurations: Intel(R) Core(TM) i7-11700 K CPU, 128GB memory, and NVIDIA GeForce RTX3090 graphics card.</p>
</sec>
<sec id="s2_3_1_2">
<label>2.3.1.2</label>
<title>Software</title>
<p>The deep learning framework PyTorch installed in Windows 10 (Microsoft, United States) was adopted to build neural network models.</p>
</sec>
<sec id="s2_3_1_3">
<label>2.3.1.3</label>
<title>Loss function</title>
<p>Models were optimized by the cross-entropy loss (cost) function (<xref ref-type="bibr" rid="B14">Huang et&#xa0;al., 2016</xref>). As shown in Equation (1), <italic>y</italic>
<sub>
<italic>i</italic>
</sub> represents the label of the pixel, <italic>p</italic>
<sub>
<italic>i</italic>
</sub> represents the predicted value of the pixel, and <italic>m</italic> represents the number of pixels in the image.</p>
<disp-formula>
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>m</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The composite backbone in our model was trained with the original cross-entropy loss. The assisting backbone, which inherited the assistant loss concept of CPNet, was also used to produce assistant supervision. In other words, the original cross-entropy loss bears the greatest responsibility, and the assistant supervision helps to optimize the learning process. Meanwhile, a hyperparameter weight was added to balance the assistant supervision. The loss used in our model is defined in Equation (2).</p>
<disp-formula>
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>L</italic>
<sub>
<italic>Comp</italic>
</sub> is the loss of the composite backbone from input to output, <italic>L</italic>
<sub>
<italic>Assist</italic>
</sub> is the loss of the assisting backbone from the input only through the low-feature path to the output, and <italic>&#x3bb;</italic> is the hyperparameter weight for the assistant supervision. In our model, <italic>&#x3bb;</italic> was set to 0.3 according to our empirical experiments.</p>
</sec>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>Training strategy</title>
<p>Two training strategies were used on the cotton leaf dataset for our model. In the first strategy, our model was trained from scratch. In the second strategy, to use the leaf information of the source domain and effectively transfer knowledge to the target domain, the PlantVillage (<xref ref-type="bibr" rid="B15">Hughes and Salath&#xe9;, 2015</xref>) dataset consisting of crop leaf images was first used to pre-train the lead backbone and the assisting backbone. The composite backbone with pre-trained weights was then trained in a fine-tuning paradigm to achieve fast learning on the cotton leaf dataset. In particular, in the fine-tuning paradigm, the composite backbones were first frozen to train the encoder-decoder part of our model fully. Then, after the model had been trained for a certain number of epochs, the composite backbones were unfrozen to complete the rest of the training.</p>
<p>The parameter setting in training from scratch is shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>, and the parameter setting in fine-tuning is shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. The optimizer of our model was the adaptive moment estimation optimizer (Adam) (<xref ref-type="bibr" rid="B18">Kingma and Ba, 2015</xref>). In Adam, the first and second moments of the gradient were used to update and correct the current learning rate (<xref ref-type="bibr" rid="B9">Dong et&#xa0;al., 2017</xref>). More importantly, if the loss did not improve for more than five epochs during the training, the minimum learning rate was set to 0. Otherwise, the learning rate would drop by 1/2, and the model would continue to train at that learning rate. The model would continue training until the loss no longer changed significantly or until the maximum number of iterations was reached.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>The parameter setting in training from scratch.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Optimizer</th>
<th valign="middle" align="center">Learning rate</th>
<th valign="middle" align="center">Batch size</th>
<th valign="middle" align="center">Epochs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Adam</td>
<td valign="middle" align="center">5e-4</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">200</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The parameter setting in fine-tuning training.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Training Stage</th>
<th valign="middle" align="center">Optimizer</th>
<th valign="middle" align="center">Learning rate</th>
<th valign="middle" align="center">Batch size</th>
<th valign="middle" align="center">Epochs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Backbone freezing</td>
<td valign="middle" align="left">Adam</td>
<td valign="middle" align="center">1e-4</td>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">100</td>
</tr>
<tr>
<td valign="middle" align="left">Fine-tuning</td>
<td valign="middle" align="left">Adam</td>
<td valign="middle" align="center">5e-5</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">100</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>Testing strategy</title>
<p>Pixel Accuracy (PA), Mean Pixel Accuracy (MPA), and Mean Intersection over Union (MIoU) (<xref ref-type="bibr" rid="B33">Shelhamer et&#xa0;al., 2015</xref>) are used to evaluate the effect of our model, as shown in Equations (3), (4), and (5).</p>
<disp-formula>
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>A</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>A</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>I</mml:mi>
<mml:mtext>o</mml:mtext>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where, <italic>k</italic> represents the number of classes, <italic>i</italic> represents the true value, <italic>j</italic> represents the predicted value, and <italic>p</italic>
<sub>
<italic>ij</italic>
</sub> represents the number of pixels of class <italic>i</italic> that are predicted as class <italic>j</italic>. Generally, <italic>p</italic>
<sub>
<italic>ii</italic>
</sub> represents true positive samples (TP), <italic>p</italic>
<sub>
<italic>ij</italic>
</sub> represents false negative samples (FN), and <italic>p</italic>
<sub>
<italic>ji</italic>
</sub> represents false-positive samples (FP).</p>
<p>However, the MIoU score is higher than the true value when measuring the boundary quality, which cannot gracefully evaluate the segmentation results of our model. Accordingly, Boundary Intersection over Union (BIoU) is introduced as an additional evaluation metric to compare the segmentation fineness better (<xref ref-type="bibr" rid="B4">Cheng et&#xa0;al., 2021</xref>). BIoU is used to evaluate the boundary quality of segmented objects based on the sensitivity of boundary error. BIoU is defined as Equation (6).</p>
<disp-formula>
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>I</mml:mi>
<mml:mtext>o</mml:mtext>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2229;</mml:mo>
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2229;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2229;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2229;</mml:mo>
<mml:mi>G</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mstyle displaystyle="true">
<mml:mo>&#x222a;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2229;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>G</italic> denotes the ground truth binary mask, <italic>P</italic> denotes the prediction binary mask, and <italic>d</italic> denotes the pixel width of the boundary region. Boundary regions <italic>G</italic>
<sub>
<italic>d</italic>
</sub> and <italic>P</italic>
<sub>
<italic>d</italic>
</sub> are the sets of all pixels within <italic>d</italic> pixels distance from the ground truth and prediction contours, respectively.</p>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results|discussion">
<label>3</label>
<title>Results and discussion</title>
<sec id="s3_1">
<label>3.1</label>
<title>Segmentation model comparison experiment</title>
<p>Segmentation models adopt the experimental setting in Section 2.3.2 for training to make the comparison fair. The performance of segmentation models in training from scratch is shown in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, and the implementation of segmentation models in fine-tuning is shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>The performance of segmentation models in training from scratch.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">Backbone</th>
<th valign="middle" align="center">Multi-scale Fusion</th>
<th valign="middle" align="center">Attention</th>
<th valign="middle" align="center">Assistant Supervision</th>
<th valign="middle" align="center">BIoU</th>
<th valign="middle" align="center">MIoU</th>
<th valign="middle" align="center">MPA</th>
<th valign="middle" align="center">PA</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">PSPNet</td>
<td valign="middle" align="left">ResNet-101</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.415</td>
<td valign="middle" align="center">0.826</td>
<td valign="middle" align="center">0.869</td>
<td valign="middle" align="center">0.877</td>
</tr>
<tr>
<td valign="middle" align="left">DANet</td>
<td valign="middle" align="left">ResNet-101</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.488</td>
<td valign="middle" align="center">0.883</td>
<td valign="middle" align="center">0.917</td>
<td valign="middle" align="center">0.933</td>
</tr>
<tr>
<td valign="middle" align="left">CPNet</td>
<td valign="middle" align="left">ResNet-101</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">0.497</td>
<td valign="middle" align="center">0.896</td>
<td valign="middle" align="center">0.927</td>
<td valign="middle" align="center">0.941</td>
</tr>
<tr>
<td valign="middle" align="left">DeepLabv3+</td>
<td valign="middle" align="left">Xception</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.522</td>
<td valign="middle" align="center">0.911</td>
<td valign="middle" align="center">0.951</td>
<td valign="middle" align="center">0.967</td>
</tr>
<tr>
<td valign="middle" align="left">Ours</td>
<td valign="middle" align="left">Composite<break/>(Xception + CoAtNet)</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">
<bold>0.583</bold>
</td>
<td valign="middle" align="center">
<bold>0.924</bold>
</td>
<td valign="middle" align="center">
<bold>0.964</bold>
</td>
<td valign="middle" align="center">
<bold>0.972</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate the maximum value in their columns.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>The performance of segmentation models in fine-tuning training.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">Backbone</th>
<th valign="middle" align="center">Multi-scale Fusion</th>
<th valign="middle" align="center">Attention</th>
<th valign="middle" align="center">Assistant Supervision</th>
<th valign="middle" align="center">BIoU</th>
<th valign="middle" align="center">MIoU</th>
<th valign="middle" align="center">MPA</th>
<th valign="middle" align="center">PA</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">PSPNet</td>
<td valign="middle" align="left">ResNet-101</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.438</td>
<td valign="middle" align="center">0.866</td>
<td valign="middle" align="center">0.893</td>
<td valign="middle" align="center">0.901</td>
</tr>
<tr>
<td valign="middle" align="left">DANet</td>
<td valign="middle" align="left">ResNet-101</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.513</td>
<td valign="middle" align="center">0.899</td>
<td valign="middle" align="center">0.925</td>
<td valign="middle" align="center">0.943</td>
</tr>
<tr>
<td valign="middle" align="left">CPNet</td>
<td valign="middle" align="left">ResNet-101</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">0.533</td>
<td valign="middle" align="center">0.911</td>
<td valign="middle" align="center">0.937</td>
<td valign="middle" align="center">0.953</td>
</tr>
<tr>
<td valign="middle" align="left">DeepLabv3+</td>
<td valign="middle" align="left">Xception</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">0.565</td>
<td valign="middle" align="center">0.923</td>
<td valign="middle" align="center">0.957</td>
<td valign="middle" align="center">0.972</td>
</tr>
<tr>
<td valign="middle" align="left">Ours</td>
<td valign="middle" align="left">Composite<break/>(Xception + CoAtNet)</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">&#x25ef;</td>
<td valign="middle" align="center">
<bold>0.608</bold>
</td>
<td valign="middle" align="center">
<bold>0.940</bold>
</td>
<td valign="middle" align="center">
<bold>0.975</bold>
</td>
<td valign="middle" align="center">
<bold>0.979</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate the maximum value in their columns.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Compared with the encouraging results of training from scratch, the evaluation indexes (BIoU and MIoU) of each segmentation model in fine-tuning training were improved accordingly. In addition, under both training strategies, PSPNet fused multi-scale features to obtain the baseline effect in the cotton leaf segmentation task under complex background. DANet inherited the attention mechanism to improve the cotton leaf segmentation task. CPNet achieved moderate results without multi-scale feature fusion or an attention mechanism by relying on the assistant supervision strategy. DeepLab v3+ took a mature CNN (Xception) as a backbone, which was the benchmark level in several standard segmentation models, both in MIoU, which represented the overall segmentation quality of the cotton leaf, and in BIoU, which meant the segmentation quality of the leaf edge.</p>
<p>Our model had significant progress compared with DeepLab v3+. Specifically, among MIoU with already high ratings, our model increased by about 1%, due to data limitations or task bottlenecks with an inconspicuous rise. However, in BIoU, our model improvement was quite noticeable, with an increase of around 5%. Without loss of generality, the BIoU was enhanced due to the composite backbone (Xception + CoAtNet). The introduction of our composite backbone not only guaranteed the generalization ability and convergence ability based on Xception, but also had the global receptive field of the self-attention layer based on CoAtNet. The global information ensured that our model worked more accurately in cotton leaf edge segmentation. Due to the structure of the composite backbone, multi-level features were obtained by the encoder and decoder of our model, thus enabling the edge pixel predictor to get a rich feature map. In addition, our model considered the progress of CPNet, which also increased the weight of our assisting loss. At the same time, the composite backbone architecture retained the conventional training mode of the backbone in essence. Decoupling the composite backbone and then pre-training the weight of the individual backbones independently was low-cost.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Segmentation model robust experiment</title>
<p>To make the comparison concrete, various images from the test set of the cotton leaf dataset were selected to visualize the results of the pre-trained segmentation models, and the types of cotton leaf images were described in Section 2.1.1. The comparison results are shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. The segmentation models effectively detect normal and diseased cotton leaves (spotted and regional lesions), especially in detecting cotton leaf edges. The texture features and shape parameters of the cotton leaves during training were simple to learn. Under the condition of shadow occlusion, the overall segmentation of our model and DeepLab v3+ was satisfactory. At the same time, CPNet had the under-segmentation phenomenon, DANet and PSPNet had the over-segmentation and under-segmentation phenomenon. DeepLab v3+, CPNet, DANet, and PSPNet over-segmented cotton leaves compared with the segmentation acceptable to our model under uneven illumination conditions.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Pre-trained segmentation models results on five types of cotton leaf images.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g008.tif"/>
</fig>
<p>The PSPNet, with ResNet-101 as the backbone, incorporated multi-scale features. The segmentation of normal and diseased cotton leaves (spotted and regional lesions) was consistent with the further determination of cotton phenotypic traits. DANet integrated with the attention mechanism, similar to PSPNet, and both had under-segmentation under the condition of shadow occlusion and uneven illumination. CPNet and DeepLab v3+, in turn, due to the backbone update and the introduction of assistant losses, the overall segmentation level was moderately acceptable except for under-segmentation in shadows and over-segmentation in uneven illumination. Since the conventional segmentation models only contained the convolution module and lacked the global receptive field, the conventional segmentation models could not learn the subtle differences between pixels. The processing effects of leaf edges were poor in the complex field environment.</p>
<p>In contrast, our model based on DeepLab v3+ accurately segmented cotton leaves in typical scenes, especially the edge of cotton leaves. Due to the proper coordination of convolution and self-attention module of assisting backbone CoAtNet and the penalty of assisting loss, our composite model could effectively learn the local and global context of complex background. The excellent performance of our model cannot be achieved without the self-attention module in the assisting backbone. In addition, our model inherited the idea of the various benchmark models to ensure that the encoder had full access to the information from the multi-layer features.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Ablation experiment</title>
<p>The assistant supervision in our model ensured that the assisting backbone contributed to the segmentation. Therefore, the penalty of assisting loss enables the model to learn more cotton leaf features, as CPNet achieved satisfactory improvement by only considering assistant loss. In addition, to fairly compare the progress of our model with DeepLab v3+, the results of decoupled assistant supervision are shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>. <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref> shows the improvement effect of assistant supervision in training from scratch and fine-tuning training strategies. In the training-from-scratch strategy, MIoU and BIoU improved from 0.915 to 0.924, and 0.553 to 0.583, respectively. Accordingly, in the fine-tuning training strategy, MIoU and BIoU improved from 0.929 to 0.940, and 0.585 to 0.608, respectively.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>The effects of assistant supervision: <bold>(A, C)</bold> represent the changing trends of MIoU and BIoU in training from scratch, <bold>(B, D)</bold> are the changing trends of MIoU and BIoU in fine-tuning training.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1111175-g009.tif"/>
</fig>
<p>In training from scratch and fine-tuning training strategies, the trajectory occasionally shows sudden declines. One of the reasons for the decline phenomenon may be the random loading of batch samples in the training data set to train our model. The randomness of training samples led to significant fluctuations in the parameters of our model, which further affected the performance of our model on the test dataset. Besides, to prevent the model from overfitting, two dropout layers were added before the upsampling layer of the decoder. Although the dropout layers can improve the robustness of our model, the dropout layers cause important neurons to be randomly deactivated, which may be another reason for the sudden declines of the trajectory. However, the introduction of assistant supervision promoted the segmentation power of our model, and the training was smoother than that of the strategy without assistant supervision. The trajectory can recover and rise in fewer epochs after a sudden decline with assistant supervision. The segmentation effect of our model was suboptimal without adopting the assistant supervision strategy. Generally, the attention mechanism of Transformers integrated into the composite backbone of our model achieved remarkable results. Due to the limitation of computing resources, the computational requirements of the Transformer cannot be met. Further, in the ablation experiment, the assisting backbone is replaced by the Transformer for comparison with our model. However, our model incorporated the attention mechanism for the broad success of Transformers, which provides a feasible strategy for overcoming the computational power requirements of Transformers and applying Transformers elegantly to agricultural tasks.</p>
</sec>
</sec>
<sec id="s4" sec-type="conclusion">
<label>4</label>
<title>Conclusion</title>
<p>In this work, from five typical cotton leaves (normal, spotted lesions, regional lesions, occluded blades, uneven illumination), a total of 800 images were labeled at the budding, flowering, and bolling stages. The composite backbone-based encoder and decoder semantic segmentation architecture (our model) was used for cotton leaf segmentation in complex field environments. The composite backbone consisted of the lead backbone Xception and the assisting backbone CoAtNet, saving the computational cost of architecture search for cotton-leaf segmentation. Xception represented the biased learning and generalization of CNN, while CoAtNet was integrated into our model with the global context inherited from Transformers. Due to the slight computational power and data requirements of CoAtNet compared with Transformers, our model not only maintained the fast convergence of convolution but also maintained the global receptive field of attention under the constraint of a certain computational cost. At the same time, the introduction of the multi-scale feature fusion mechanism and assistant supervision strategy effectively improved the performance of our model. The experimental results showed that the cotton leaf segmentation performance of our model, especially under complex field environments, was significantly better than that of the PSPNet, DANet, CPNet and DeepLab v3+ benchmark models, and the under-segmentation and over-segmentation of five typical cotton leaves were encouraging. In addition, different backbones can be trained offline and reassembled into composite backbones with limited computing resources. In the future, more types and numbers of pre-trained backbones can be combined to achieve faster and better plant high-throughput phenotypic tasks.</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>JY, TY, PG, and WX contributed to conception and design of the study. JY, TY, and WY contributed to the preparation of equipment and the acquisition of data. JY and TY wrote the code and tested the method. JY, TY, WY, PG, and WX validated the results. JY wrote the first draft of the manuscript. TY, PG, WX, and XL wrote sections of the manuscript. All authors contributed to manuscript revision, read, and approved the submitted version.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>This work has been partially supported by the National Natural Science Foundation of China (grant numbers, 61965014 and 62265015), and the Postgraduate Scientific Research and Innovation Project of Xinjiang Uygur Autonomous Region (grant number, XJ2022G107).</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Cui</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Phage_UniR_LGBM: Phage virion proteins classification with UniRep features and LightGBM model</article-title>. <source>Comput. Math. Methods Med.</source> <volume>2022</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2022/9470683</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>2-hydr_Ensemble: Lysine 2-hydroxyisobutyrylation identification with ensemble method</article-title>. <source>Chemometrics Intelligent Lab. Systems.</source> <volume>215</volume>, <elocation-id>104351</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.chemolab.2021.104351</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barbedo</surname> <given-names>J. G. A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Factors influencing the use of deep learning for plant disease recognition</article-title>. <source>Biosyst. Engineering.</source> <volume>172</volume>, <fpage>84</fpage>&#x2013;<lpage>91</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.BIOSYSTEMSENG.2018.05.013</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R. B.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Berg</surname> <given-names>A. C.</given-names>
</name>
<name>
<surname>Kirillov</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Boundary IoU: Improving object-centric image segmentation evaluation</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>Computer Vision Foundation / IEEE</publisher-name>). <fpage>15329</fpage>&#x2013;<lpage>15337</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01508</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
<name>
<surname>Papandreou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Kokkinos</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Murphy</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Yuille</surname> <given-names>A. L.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intelligence.</source> <volume>40</volume>, <fpage>834</fpage>&#x2013;<lpage>848</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2017.2699184</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Su</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Mi</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Segmentation of field grape bunches <italic>via</italic> an improved pyramid scene parsing network</article-title>. <source>Int. J. Agric. Biol. Engineering.</source> <volume>14</volume> (<issue>6</issue>), <fpage>185</fpage>&#x2013;<lpage>194</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.25165/j.ijabe.20211406.6903</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Papandreou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Encoder-decoder with atrous separable convolution for semantic image segmentation</article-title>. <source>Eur. Conf. Comput. Vision</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <volume>11211</volume>:<fpage>801</fpage>&#x2013;<lpage>818</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_49</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Le</surname> <given-names>Q. V.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>CoAtNet: Marrying convolution and attention for all data sizes</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>34</volume>, <fpage>3965</fpage>&#x2013;<lpage>3977</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2106.04803</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Mo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Automatic brain tumor detection and segmentation using U-net based fully convolutional networks</article-title>,&#x201d; in <conf-name>Annual Conference on Medical Image Understanding and Analysis</conf-name>, (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>). <fpage>506</fpage>&#x2013;<lpage>517</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-60964-5_44</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations</conf-name>, (<publisher-loc>Ithaca, NY</publisher-loc>: <publisher-name>OpenReview.net.</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.-C.</given-names>
</name>
<name>
<surname>Zha</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Exploring new backbone and attention module for semantic segmentation in street scenes</article-title>. <source>IEEE Access.</source> <volume>6</volume>, <fpage>71566</fpage>&#x2013;<lpage>71580</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2018.2880877</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chi</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>H.-Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Cotton cultivation technology with Chinese characteristics has driven the 70-year development of cotton production in China</article-title>. <source>J. Integr. Agriculture.</source> <volume>21</volume> (<issue>3</issue>), <fpage>597</fpage>&#x2013;<lpage>609</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S2095-3119(20)63457-8</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Dual attention network for scene segmentation</article-title>,&#x201d; in <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>Computer Vision Foundation / IEEE</publisher-name>). <fpage>3141</fpage>&#x2013;<lpage>3149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00326</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Loy</surname> <given-names>C. C.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Learning deep representation for imbalanced classification</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, (<publisher-loc>Los Alamitos, CA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>). <fpage>5375</fpage>&#x2013;<lpage>5384</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.580</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Salath&#xe9;</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>An open access repository of images on plant health to enable the development of mobile disease diagnostics through machine learning and crowdsourcing</article-title>,&#x201d; in <source>arXiv</source>. Available at: <uri xlink:href="https://arxiv.org/abs/1511.08060">https://arxiv.org/abs/1511.08060</uri>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Semantic segmentation model of cotton roots in-situ image based on attention mechanism</article-title>. <source>Comput. Electron. Agric.</source> <volume>189</volume>, <elocation-id>106370</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106370</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Naseer</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Hayat</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zamir</surname> <given-names>S. W.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>F. S.</given-names>
</name>
<name>
<surname>Shah</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Transformers in vision: A survey</article-title>. <source>ACM Computing Surveys (CSUR).</source> <volume>54</volume>, <fpage>1</fpage>&#x2013;<lpage>41</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3505244</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Ba</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Adam: A method for stochastic optimization</article-title>,&#x201d; in <conf-name>Anon. International Conference on Learning Representations</conf-name> <source>3rd International Conference on Learning Representations, ICLR 2015</source>, <conf-loc> San Diego, CA, USA</conf-loc> (<publisher-loc>Ithaca, NY</publisher-loc>: <publisher-name>OpenReview.net</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1412.6980</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>CBNetV2: A composite backbone network architecture for object detection</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>Computer Vision Foundation / IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2107.00420</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>a). &#x201c;<article-title>Plant disease and insect pest identification based on vision transformer</article-title>,&#x201d; in <conf-name>International Conference on Internet of Things and Machine Learning (IoTML 2021)</conf-name>, (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>ACM</publisher-name>). <fpage>194</fpage>&#x2013;<lpage>201</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1117/12.2628467</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.-Y.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Belongie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, (<publisher-loc>Los Alamitos, CA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>). <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2017.106</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chai</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A CNN-transformer network with multiscale context aggregation for fine-grained cropland change detection</article-title>. <source>IEEE J. Selected Topics Appl. Earth Observations Remote Sensing.</source> <volume>15</volume>, <fpage>4297</fpage>&#x2013;<lpage>4306</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/jstars.2022.3177235</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Lehman</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Molino</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Such</surname> <given-names>F. P.</given-names>
</name>
<name>
<surname>Frank</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Sergeev</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>An intriguing failing of convolutional neural networks and the CoordConv solution</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>2018</volume>, <fpage>31</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1807.03247</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). &#x201c;<article-title>Cbnet: A novel composite backbone network architecture for object detection</article-title>,&#x201d; in <conf-name>Proceedings of the AAAI conference on artificial intelligence</conf-name>, (<publisher-loc>Menlo Park</publisher-loc>: <publisher-name>AAAI Press</publisher-name>). <fpage>11653</fpage>&#x2013;<lpage>11660</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1609/aaai.v34i07.6834</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>b). <article-title>Attention-optimized DeepLab V3&#x2009;+&#x2009;for automatic estimation of cucumber disease severity</article-title>. <source>Plant Methods</source> <volume>18</volume>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13007-022-00941-8</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>ACT-SVM: Prediction of protein-protein interactions based on support vector basis model</article-title>. <source>Sci. Programming.</source> <volume>2020</volume>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2020/8866557</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Rodene</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Schnable</surname> <given-names>J. C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Semantic segmentation of sorghum using hyperspectral data identifies genetic associations</article-title>. <source>Plant Phenomics</source> <volume>2020</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.34133/2020/4216373</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Minaee</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Boykov</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Porikli</surname> <given-names>F. M.</given-names>
</name>
<name>
<surname>Plaza</surname> <given-names>A. J.</given-names>
</name>
<name>
<surname>Kehtarnavaz</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Terzopoulos</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Image segmentation using deep learning: A survey</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intelligence.</source> <volume>44</volume>, <fpage>3523</fpage>&#x2013;<lpage>3542</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2021.3059968</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mubarik</surname> <given-names>M. S.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Majeed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Azhar</surname> <given-names>M. T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Revamping of cotton breeding programs for efficient use of genetic resources under changing climate</article-title>. <source>Agronomy.</source> <volume>10</volume>, <elocation-id>1190</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy10081190</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reedha</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Dericquebourg</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Canals</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Hafiane</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Transformer neural network for weed and crop classification of high resolution UAV images</article-title>. <source>Remote. Sens.</source> <volume>14</volume>, <elocation-id>592</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14030592</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saeed</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>M. A.</given-names>
</name>
<name>
<surname>Sharif</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Mittal</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>L. M.</given-names>
</name>
<name>
<surname>Roy</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Deep neural network features fusion and selection based on PLS regression with an application for crops diseases classification</article-title>. <source>Appl. Soft Comput.</source> <volume>103</volume>, <elocation-id>107164</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.asoc.2021.107164</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>MobileNetV2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>Computer Vision Foundation / IEEE</publisher-name>). <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00474</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shelhamer</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Long</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Darrell</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Fully convolutional networks for semantic segmentation</article-title>,&#x201d; in <source>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Los Alamitos, CA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>). <fpage>3431</fpage>&#x2013;<lpage>3440</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2015.7298965</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taghanaki</surname> <given-names>S. A.</given-names>
</name>
<name>
<surname>Abhishek</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Cohen</surname> <given-names>J. P.</given-names>
</name>
<name>
<surname>Cohen-Adad</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Hamarneh</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep semantic segmentation of natural and medical images: A review</article-title>. <source>Artif. Intell. Review.</source> <volume>54</volume>, <fpage>137</fpage>&#x2013;<lpage>178</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-020-09854-1</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Torralba</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Russell</surname> <given-names>B. C.</given-names>
</name>
<name>
<surname>Yuen</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>LabelMe: Online image annotation and applications</article-title>. <source>Proc. IEEE.</source> <volume>98</volume>, <fpage>1467</fpage>&#x2013;<lpage>1484</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JPROC.2010.2050290</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>CCTNet: Coupled CNN and transformer network for crop segmentation of remote sensing images</article-title>. <source>Remote. Sens.</source> <volume>14</volume>, <elocation-id>1956</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14091956</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Woo</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Park</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J.-Y.</given-names>
</name>
<name>
<surname>Kweon</surname> <given-names>I. S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>CBAM: Convolutional block attention module</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV)</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>). <fpage>3</fpage>&#x2013;<lpage>19</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Multi-granularity feature extraction based on vision transformer for tomato leaf disease recognition</article-title>,&#x201d; in <source>2021 3rd International Academic Exchange Conference on Science and Technology Innovation (IAECST)</source> (<publisher-loc>Guangzhou</publisher-loc>). <fpage>387</fpage>&#x2013;<lpage>390</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/iaecst54258.2021.9695688</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Leaf segmentation and classification with a complicated background using deep learning</article-title>. <source>Agronomy.</source> <volume>10</volume>, <elocation-id>1721</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy10111721</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yan</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Apple leaf diseases recognition based on an improved convolutional neural network</article-title>. <source>Sensors (Basel Switzerland).</source> <volume>20</volume>, <elocation-id>3535</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s20123535</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ye</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Cotton breeding research progress in China</article-title>. <source>New Biotechnol.</source> <volume>31</volume>, <fpage>168</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.NBT.2014.05.2038</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yuan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>An improved DeepLab v3+ deep learning network applied to the segmentation of grape leaf black rot spots</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.795410</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Sang</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Context prior for scene segmentation</article-title>,&#x201d; in <source>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>Computer Vision Foundation / IEEE</publisher-name>). <fpage>12413</fpage>&#x2013;<lpage>12422</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/cvpr42600.2020.01243</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Perennial cotton ratoon cultivation: A sustainable method for cotton production and breeding</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.882610</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Pyramid scene parsing network</article-title>,&#x201d; in <source>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source>, (<publisher-loc>Los Alamitos, CA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>). <fpage>6230</fpage>&#x2013;<lpage>6239</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2017.660</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zoph</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Le</surname> <given-names>Q. V.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Neural architecture search with reinforcement learning</article-title>,&#x201d; in <source>International Conference on Learning Representations (ICLR) 2017</source> (<publisher-loc>Ithaca, NY</publisher-loc>: <publisher-name>OpenReview.net</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1611.01578</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>
