<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2023.1091600</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Class-attention-based lesion proposal convolutional neural network for strawberry diseases identification</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Xiaobo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1836552"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Rujing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1478151"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Du</surname>
<given-names>Jianming</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2050779"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Yimin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2089687"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jiao</surname>
<given-names>Lin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1661590"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Taosheng</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Science Island Branch, University of Science and Technology of China</institution>, <addr-line>Hefei, Anhui</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Institute of Intelligent Machines, Hefei Institutes of Physical Science, Chinese Academy of Sciences (CAS)</institution>, <addr-line>Hefei, Anhui</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Institute of Physical Science and Information Technology, Anhui University</institution>, <addr-line>Hefei, Anhui</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>School of Internet, Anhui University</institution>, <addr-line>Hefei, Anhui</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Baohua Zhang, Nanjing Agricultural University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Sashuang Sun, Zhejiang University, China; Chu Zhang, Huzhou University, China; Changmiao Wang, The Chinese University of Hong Kong, Shenzhen, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Rujing Wang, <email xlink:href="mailto:rjwang@iim.ac.cn">rjwang@iim.ac.cn</email>; Taosheng Xu, <email xlink:href="mailto:taosheng.x@gmail.com">taosheng.x@gmail.com</email>
</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to Technical Advances in Plant Science, a section of the journal Frontiers in Plant Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>01</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1091600</elocation-id>
<history>
<date date-type="received">
<day>07</day>
<month>11</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>01</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Hu, Wang, Du, Hu, Jiao and Xu</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Hu, Wang, Du, Hu, Jiao and Xu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Diseases have a great impact on the quality and yield of strawberries, so an accurate and timely field disease identification method is urgently needed. However, identifying diseases of strawberries in the field is challenging due to the complex background interference and subtle inter-class differences. A feasible method to address the challenges is to segment strawberry lesions from the background and learn fine-grained features of the lesions. Following this idea, we present a novel Class-Attention-based Lesion Proposal Convolutional Neural Network (CALP-CNN), which utilizes a class response map to locate the main lesion object and propose discriminative lesion details. Specifically, the CALP-CNN first locates the main lesion object from the complex background through a class object location module (COLM) and then applies a lesion part proposal module (LPPM) to propose the discriminative lesion details. With a cascade architecture, the CALP-CNN can simultaneously address the interference from the complex background and the misclassification of similar diseases. A series of experiments on a self-built dataset of field strawberry diseases is conducted to verify the effectiveness of the proposed CALP-CNN. The classification results of the CALP-CNN are 92.56%, 92.55%, 91.80% and 91.96% on the metrics of accuracy, precision, recall and F1-score, respectively. Compared with six state-of-the-art attention-based fine-grained image recognition methods, the CALP-CNN achieves 6.52% higher (on F1-score) than the sub-optimal baseline MMAL-Net, suggesting that the proposed methods are effective in identifying strawberry diseases in the field.</p>
</abstract>
<kwd-group>
<kwd>convolutional neural network</kwd>
<kwd>strawberry disease identification</kwd>
<kwd>complex background</kwd>
<kwd>similar diseases</kwd>
<kwd>class response map</kwd>
<kwd>main lesion object</kwd>
<kwd>lesion details</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="6"/>
<equation-count count="20"/>
<ref-count count="45"/>
<page-count count="14"/>
<word-count count="7836"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Strawberry, often praised as the &#x201c;Queen of Fruits&#x201d;, is rich in vitamin C and antioxidants that support heart health and blood sugar control (<xref ref-type="bibr" rid="B10">Hannum, 2004</xref>). It is becoming a new income-producing agricultural product compared with traditional crops. However, strawberries are very delicate and highly susceptible to infection in the natural environment. They are prone to various infectious diseases caused by fungal, bacterial and viral pathogens (<xref ref-type="bibr" rid="B14">Iqbal et&#xa0;al., 2021</xref>). Up to now, many strawberry diseases have been identified during the whole cultivation period of strawberries. These diseases can occur in strawberries&#x2019; fruit, leaf, and stem, such as gray mold, powdery mildew and anthracnose. Therefore, disease management is a routine and labor-intensive requirement in strawberry cultivation. Currently, the identification of strawberry diseases is empirically conducted by growers, especially in China. The various types of diseases pose a great challenge to the accurate identification of the growers. Meanwhile, the manual approaches are expensive, laborious and subjective, making them hard to apply widely in modern agriculture. Hence, the current strawberry disease management cannot meet the need for automatic monitoring in agricultural practice (<xref ref-type="bibr" rid="B13">Hu et&#xa0;al., 2021</xref>). Furthermore, most strawberry growers lack professional knowledge to distinguish the diseases, resulting in the use of incorrect and overdose fungicides in disease management. The abuse of fungicides greatly harms the health of consumers and has caused substantial economic loss (<xref ref-type="bibr" rid="B35">Wang et&#xa0;al., 2021b</xref>). There is an urgent need for a fast and effective method to identify diseases in strawberry farming.</p>
<p>In general, the visual symptoms, including color, texture, shape and location of the lesions, are important evidence for disease identification (<xref ref-type="bibr" rid="B30">Sankaran et&#xa0;al., 2010</xref>; <xref ref-type="bibr" rid="B4">Cruz et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B20">Liang et&#xa0;al., 2019</xref>). Given these visual features, various methods based on computer vision (CV) technology have been developed to identify different crop diseases. The CV-based methods for crop disease identification can be summarized into two streams. In the first stream, the traditional CV-based methods (such as color space transform, histogram of oriented gradient and gray level co-occurrence matrix [GLCM]) are applied to extract lesion features from diseased spots (<xref ref-type="bibr" rid="B18">Kim et&#xa0;al., 2009</xref>; <xref ref-type="bibr" rid="B28">Revathi and Hemalatha, 2014</xref>; <xref ref-type="bibr" rid="B17">Kaur et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B16">Johannes et&#xa0;al., 2017</xref>). Then, a classifier (e.g., linear/logistic regression, random forest) is constructed to yield classification results based on the extracted features (<xref ref-type="bibr" rid="B12">Huang, 2007</xref>; <xref ref-type="bibr" rid="B17">Kaur et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B15">Iqbal et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B7">Dwivedi et&#xa0;al., 2021</xref>). For instance, three Phalaenopsis seedling diseases were successfully identified by an artificial neural network with the GLCM-extracted texture features (<xref ref-type="bibr" rid="B12">Huang, 2007</xref>). Besides, <xref ref-type="bibr" rid="B16">Johannes et&#xa0;al. (2017)</xref> designed two descriptors of their segmented hot-spot blobs to validate the effectiveness of the related traditional CV-based methods in identifying diseases at the early stage under a complex field background. 
The two descriptors were used to describe the color and texture features of the blob lab channels, respectively. These studies have proved that traditional CV-based methods are effective in recognizing the diseases of crops in both laboratory and field environments. However, these methods rely on the manual selection of discriminative features among diseases. The discriminative feature selection in field disease identification is very difficult and time-consuming (<xref ref-type="bibr" rid="B42">Zhao et&#xa0;al., 2022</xref>). Furthermore, the identification accuracy could dramatically decrease with a slight change in the input image dataset (<xref ref-type="bibr" rid="B1">Arsenovic et&#xa0;al., 2019</xref>). These shortcomings result in the traditional CV-based methods being rarely adopted in the practice of crop disease identification. The convolutional neural network (CNN) and its variants lead the second stream for crop disease identification. The CNN-based models can automatically extract basic features like color, texture, edge, and location information. Meanwhile, they are competent to obtain more abstract semantic information from the image of crop diseases (<xref ref-type="bibr" rid="B40">Zeiler and Fergus, 2014</xref>). Besides, these CNN-based models have more flexible architectures that can be applied as feature extractors or classifiers. In recent studies, the CNN-based models have become the preferred method to identify crop diseases (<xref ref-type="bibr" rid="B20">Liang et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B13">Hu et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B39">Yang et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B42">Zhao et&#xa0;al., 2022</xref>). 
Earlier studies applied the classical CNN models, such as AlexNet (<xref ref-type="bibr" rid="B19">Krizhevsky et&#xa0;al., 2012</xref>), GoogLeNet (<xref ref-type="bibr" rid="B33">Szegedy et&#xa0;al., 2015</xref>), and ResNet (<xref ref-type="bibr" rid="B11">He et&#xa0;al., 2016</xref>) on some specific crop disease datasets and found the most suitable model for the disease identification tasks (<xref ref-type="bibr" rid="B22">Mohanty et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B32">Srdjan. et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B8">Ferentinos, 2018</xref>; <xref ref-type="bibr" rid="B34">Too et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B25">Picon et&#xa0;al., 2019</xref>). The related models achieve preferable recognition accuracy on their disease datasets. However, these studies fail to consider the complexity of the practical application of field disease identification. The main challenges of field disease identification are the complex background and a variety of diseases with similar symptoms (<xref ref-type="bibr" rid="B2">Barbedo, 2018</xref>). These models cannot be applied to crop cultivation practice. Consequently, some studies aim at reducing the misclassification caused by complex backgrounds and diseases with similar symptoms.</p>
<p>A simple yet effective method to eliminate the influence of complex background on disease identification is to segment the lesion region from its background. Several CNN-based semantic segmentation methods have been proposed to mitigate the adverse impact of the background. <xref ref-type="bibr" rid="B23">Ngugi et&#xa0;al. (2020)</xref> proposed a segmentation network, KijianiNet, to segment tomato leaves from the natural field conditions. <xref ref-type="bibr" rid="B13">Hu et&#xa0;al. (2021)</xref> and <xref ref-type="bibr" rid="B36">Wang et&#xa0;al. (2021a)</xref> adopted U-Net (<xref ref-type="bibr" rid="B29">Ronneberger et&#xa0;al., 2015</xref>) and DeepLabV3+ (<xref ref-type="bibr" rid="B3">Chen et&#xa0;al., 2018</xref>) in the first stage of their models to segment the diseased leaves from the field scenes, respectively. The related experimental results showed that extracting diseased regions from the background can greatly improve the identification performance of the models. However, CNN-based semantic segmentation methods require pixel-level supervision. Such pixel-level annotation by experts is time-consuming, laborious and costly since plenty of lesions have varied shapes. On the topic of similar disease identification, few studies have proposed effective approaches to tackle this issue. <xref ref-type="bibr" rid="B4">Cruz et&#xa0;al. (2019)</xref> applied transfer learning and data augmentation technologies to enhance the ability of the classical CNN models (e.g., AlexNet, GoogLeNet and ResNet) to distinguish the grapevine yellow from its similar diseases (such as grapevine leafroll and stictocephala bisonia). The experimental results confirmed that the data augmentation technologies were beneficial for classical CNN models to identify grape diseases, because a suitable data augmentation strategy could increase the differences among similar diseases. However, the strategy was not easy to obtain; it required trial and error. 
The research of <xref ref-type="bibr" rid="B39">Yang et&#xa0;al. (2022)</xref> was a development in identifying similar diseases of field crops. Similar diseases were classified by increasing the weight of discriminative lesion features. To locate lesion details and learn discriminative lesion features among similar diseases, they proposed a self-supervised multi-network fusion classification model. However, the locations of the lesion details were randomly generated. Furthermore, all the obtained lesion details needed to be fed to a classifier to assess the confidence of these regions as lesions, which greatly increased the time consumption of the model.</p>
<p>Image-based automatic disease identification is a basic need of modern large-scale cultivation agriculture. Field disease identification is challenging due to the complex background and similar symptoms among diseases. To address these problems, this paper focuses on strawberry field disease identification and proposes a novel Class-Attention-based Lesion Proposal Convolutional Neural Network (CALP-CNN) to precisely identify strawberry diseases in the field. The CALP-CNN method first utilizes a class-attention mechanism to enhance the localization of discriminative lesion features. Two specific modules (i.e., the class object location module, COLM, and the lesion part proposal module, LPPM) are designed to recursively segment the main lesion object and lesion details from an input image. Finally, the features of the original image, main lesion object and lesion details are concatenated for final identification. To our knowledge, the CALP-CNN method is the first attempt to simultaneously address the challenges posed by the complex background and similar symptoms to crop disease identification in the field. The main contributions are summarized as follows:</p>
<list list-type="bullet">
<list-item>
<p>We introduce a new class attention mechanism (i.e., the class response map) to improve the ability of the CNN to localize the discriminative lesion features.</p>
</list-item>
<list-item>
<p>We address the challenges of field disease identification by developing a novel CALP-CNN that simultaneously removes the noisy background and effectively learns discriminative lesion representations among similar diseases in an unsupervised way.</p>
</list-item>
<list-item>
<p>A series of experiments are conducted on the field strawberry disease dataset to evaluate the effectiveness of the CALP-CNN. The experimental results show that the proposed method has better performance than other state-of-the-art fine-grained methods on field strawberry disease identification.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Material and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Material</title>
<p>In this paper, the strawberry diseases with high incidence in planting practice were taken as our research objects. To this end, a strawberry common disease dataset (SCDD) was constructed. The SCDD was collected in two ways: field-collection and internet crawling. We first shot 1,326 disease images of three strawberry varieties (Fengxiang, Nvfeng and Hongyan) in ChangFeng County, Anhui Province, China. To increase the diversity of the dataset, the images were deliberately captured in the field at different angles and focal lengths. The second part was from the internet. A crawler was applied to download more than 5,000 images of field strawberry diseases. The collected images were manually screened one by one to discard the poor-quality samples (blurred images or those with a resolution lower than 224&#xd7;224). The disease images in the dataset were annotated by three experts. One was responsible for labeling the dataset, and the other two were responsible for reviewing the results. Finally, a high-quality dataset of strawberry diseases with 3,411 images was constructed for downstream analysis. The SCDD contained 11 common diseases and one healthy type. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> shows detailed information of the SCDD. In addition, the typical symptoms of 11 common strawberry diseases are shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>List of strawberry common disease dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Category label</th>
<th valign="top" align="center">Strawberry disease</th>
<th valign="top" align="center">Number</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">0</td>
<td valign="top" align="center">healthy</td>
<td valign="top" align="center">509</td>
</tr>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="center">leaf scorch</td>
<td valign="top" align="center">287</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="center">gray mold</td>
<td valign="top" align="center">332</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="center">powdery mildew</td>
<td valign="top" align="center">344</td>
</tr>
<tr>
<td valign="top" align="left">4</td>
<td valign="top" align="center">brown spot</td>
<td valign="top" align="center">215</td>
</tr>
<tr>
<td valign="top" align="left">5</td>
<td valign="top" align="center">fertilizer disorder</td>
<td valign="top" align="center">308</td>
</tr>
<tr>
<td valign="top" align="left">6</td>
<td valign="top" align="center">fusarium wilt</td>
<td valign="top" align="center">145</td>
</tr>
<tr>
<td valign="top" align="left">7</td>
<td valign="top" align="center">white leaf spot</td>
<td valign="top" align="center">259</td>
</tr>
<tr>
<td valign="top" align="left">8</td>
<td valign="top" align="center">calcium deficiency</td>
<td valign="top" align="center">431</td>
</tr>
<tr>
<td valign="top" align="left">9</td>
<td valign="top" align="center">magnesium deficiency</td>
<td valign="top" align="center">197</td>
</tr>
<tr>
<td valign="top" align="left">10</td>
<td valign="top" align="center">anthracnose</td>
<td valign="top" align="center">198</td>
</tr>
<tr>
<td valign="top" align="left">11</td>
<td valign="top" align="center">bacterial leaf spot</td>
<td valign="top" align="center">186</td>
</tr>
<tr>
<td valign="top" align="left">Total</td>
<td valign="top" align="left"/>
<td valign="top" align="center">3411</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The typical symptoms of 11 common strawberry diseases and one healthy type. The annotated labels of the diseases are in one-to-one correspondence with <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g001.tif"/>
</fig>
<p>In our experiments, the dataset was randomly divided into a training set, a validation set and a testing set in the ratio of 6:2:2 (2,047 images for training, 682 images for validation, and the remaining 682 images for testing). In the training process, we adopted the online data augmentation strategies to increase the diversity of the dataset and the robustness of the models. Specifically, the processes of Normalize, RandomHorizontalFlip, RandomVerticalFlip, and RandomResizedCrop (crop to 224&#xd7;224) were applied during training.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Methods</title>
<p>In this paper, a class-attention-based lesion proposal CNN is presented to settle the main challenges of CNN-based methods in field disease identification, i.e., the complex background and similar diseases. The framework of CALP-CNN is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>. A cascade architecture is designed for extracting the region-based features from the input images at three scales including the raw image at coarse-grained level, the main lesion object at medium-grained level and the lesion detail images at fine-grained level. Furthermore, a series of modules are developed to extract class-related features in each layer of the cascade architecture. The detailed information of the CALP-CNN is described as follows:</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The framework of the proposed CALP-CNN. A cascade architecture is designed to construct the lesion details at different scales. A CNN-based backbone is repeatedly used to extract features from the coarse raw image to lesion detail images. The CRM module generates the class response map from the features. The COLM and the LPPM can obtain the coordinates of the lesion object and the lesion details, respectively. All features (the stripes marked with purple, green, and red) are concatenated for final identification. The classification loss <italic>L<sub>cls</sub>
</italic> (cross-entropy loss between ground truth label <italic>Y<sup>*</sup>
</italic> and predicted labels <italic>Y<sup>r</sup>
</italic>, <italic>Y<sup>o</sup></italic>, <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>Y<sup>c</sup></italic>) and the pairwise ranking loss <italic>L<sub>rank</sub>
</italic> (the loss between raw probability <italic>p<sup>r</sup>
</italic>, object probability <italic>p<sup>o</sup></italic>, and lesion probabilities <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) are combined to optimize the network and make it converge.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g002.tif"/>
</fig>
<p>First, a CNN backbone is repeatedly applied to extract region-based features from the input images in three scales. The CNN modules in three scales are given the same parameters. Second, the features are fed forward to three classifiers to predict three probability scores. The computed probability scores represent the prediction confidence of each disease category. Meanwhile, a class response map (CRM) module is constructed to generate a class attention matrix based on the region-based features. Here, the class attention matrix is defined as a class response map in this paper. Third, two different modules (COLM and LPPM) are developed to detect lesion regions based on the corresponding attention matrix from different scales of the input image, respectively. The COLM is used for locating the main lesion object in the image at coarse-grained level, while the LPPM proposes lesion details in the image at medium-grained level. Once an attended region is located, we segment the region and zoom it in to the raw image size. The located regions can be employed to generate a series of highly reliable lesion features. As a whole, the CALP-CNN takes advantage of ensemble learning to integrate the features from three scales for final identification. Moreover, the CALP-CNN combines an intra-scale cross-entropy loss and an inter-scale pairwise ranking loss to ensure rapid convergence.</p>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>Class response map</title>
<p>A series of class activation maps can be generated by the product of CNN feature maps with their corresponding class scores. The studies of (<xref ref-type="bibr" rid="B45">Zhou et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>) have proved that the class-related information in the class activation maps is effective for locating discriminative regions in an image. In this paper, we obtain discriminative information of lesions based on the class activation maps and construct a class response map (also denoted as class attention matrix) to localize the objects of interest. <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref> shows the generation process of a class response map.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The generation process of class response map.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g003.tif"/>
</fig>
<p>First, a pre-trained CNN backbone is applied to extract the feature maps of a 3-channel image <italic>I</italic>&#x2208;<italic>R</italic>
<sup>3&#xd7;<italic>H</italic>&#xd7;<italic>W</italic>
</sup> , where the <italic>H &#xd7; W</italic> is the spatial size of the image. The extracted feature maps are represented as <italic>S</italic>&#x2208;<italic>R</italic>
<sup>
<italic>C</italic>&#xd7;<italic>H</italic>
<sub>
<italic>f</italic>
</sub>&#xd7;<italic>W</italic>
<sub>
<italic>f</italic>
</sub>
</sup>, where <italic>C</italic> is the channel number and <italic>H<sub>f</sub> &#xd7; W<sub>f</sub>
</italic> is the spatial size of the feature maps. Second, the feature maps <italic>S</italic> are fed forward to a classifier consisting of a fully connected (FC) layer and a softmax layer. A vector <italic>p</italic>&#x2208;<italic>R</italic>
<sup>
<italic>N</italic>
<sub>
<italic>c</italic>
</sub>
</sup> (<italic>N<sub>c</sub>
</italic> is the pre-set category number of the strawberry diseases) can be computed by the classifier as the predicted probability score of each disease. In addition, the weights of the FC layer are denoted as <italic>w</italic>
<sub>
<italic>fc</italic>
</sub>&#x2208;<italic>R</italic>
<sup>
<italic>C</italic>&#xd7;<italic>N</italic>
<sub>
<italic>c</italic>
</sub>
</sup> . Third, a CRM module is designed to generate the class-related feature maps. It establishes a new convolutional layer with the weight of the <italic>w<sub>fc</sub>
</italic> (i.e., the formed convolutional layer achieves the same weights as the FC layer). Therefore, it possesses a strong ability to extract class-related features. Based on the constructed convolutional layer, a set of class-related feature maps <italic>Q</italic>={<italic>Q</italic>
<sub>
<italic>i</italic>
</sub>}(<italic>Q</italic>
<sub>
<italic>i</italic>
</sub>&#x2208;<italic>R</italic>
<sup>
<italic>H<sub>f</sub></italic>&#xd7;<italic>W<sub>f</sub></italic>
</sup>,<italic>i</italic>=1,&#x2026;,<italic>N</italic>
<sub>
<italic>c</italic>
</sub>) can be generated from the extracted <italic>S</italic>. The <italic>Q<sub>i</sub>
</italic> represents the <italic>i</italic>-th channel. The features of the <italic>Q<sub>i</sub>
</italic> are most relevant to category <italic>i</italic>. In the training process, the CALP-CNN applies the ground truth label to select the most class-related feature map of the convolutional layer as the class response map. That is to say, if the image is annotated as category <italic>c</italic>, the class response map is <italic>Q<sub>c</sub>
</italic>. In the testing process, there is no ground truth label of the input image. Following <xref ref-type="bibr" rid="B6">Ding et&#xa0;al. (2019)</xref>, the CALP-CNN adopts the entropy of the top 5 predicted probabilities to evaluate the lesion information in their corresponding class-related maps. Let <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>5</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> be the subset of <italic>p</italic> for the top 5 predicted class probabilities. We compute the entropy as</p>
<disp-formula>
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>5</mml:mn>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mo>&#xb7;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2003;</mml:mtext>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</disp-formula>
<p>and construct the class response map <italic>Q<sub>c</sub>
</italic> based on the following strategy,</p>
<disp-formula>
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>5</mml:mn>
</mml:munderover>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mstyle>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>M</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the class-related feature maps corresponding to <inline-formula>
<mml:math display="inline" id="im5">
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
</mml:math>
</inline-formula> and &#x3b5; is a threshold (empirically set to 0.2).</p>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Class object location module (COLM)</title>
<p>In most cases, the CNN backbone could extract many irrelevant and noisy features that are adverse to disease identification, especially for a complex background (<xref ref-type="bibr" rid="B2">Barbedo, 2018</xref>). To cope with this issue, we design the COLM to locate the main lesion object and discard the irrelevant background region. This module is inspired by the discriminative region location methods of the fine-grained image classification and retrieval domain (<xref ref-type="bibr" rid="B37">Wei et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B41">Zhang et&#xa0;al., 2021</xref>). The pipeline of COLM is shown as <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.
</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The pipeline of COLM. A class response map is generated from a CRM module. The pixels in the class response map are compared to their average value to generate a mask map. Some non-lesion areas are activated by the complex background in the mask map.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g004.tif"/>
</fig>
<p>The class response map <italic>Q<sub>c</sub>
</italic> is resized to the same size as the input image <italic>I</italic> by a bilinear interpolation algorithm. The interpolation result is denoted as <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msubsup>
<mml:mi>Q</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Ding et&#xa0;al. have concluded that the larger a value in the class response map, the more related the corresponding pixel is to the class (<xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>). In most cases, we have no prior knowledge about the location of the lesion objects since most crop disease datasets only have image-level supervision. Therefore, the average value of all pixels in the interpolated class response map, computed as Eq. 3, is used as the threshold to separate the lesion object from the background.</p>
<disp-formula>
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>H</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>W</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>Q</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Then, a mask map <italic>M</italic> can be generated according to Eq. 4.</p>
<disp-formula>
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mo>&#x200b;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msubsup>
<mml:mi>Q</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&gt;</mml:mo>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mtext>&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;</mml:mtext>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;&#xa0;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, the object regions are marked red in the mask map. We can observe some noisy regions (the top-left and bottom-right) in the mask. In fact, the noisy regions could be non-lesion parts, whereas they are activated by the complex background. Fortunately, the sizes of the noisy regions are typically smaller than the main lesion object. The flood-fill algorithm is a common method to connect neighboring and related elements of a matrix. In this paper, we apply it to test the connectivity of all the points in <italic>M</italic> and find out the largest connected area. The largest connected area is the location of the main lesion object. The minimum enclosing rectangle of the largest connected area is denoted as <italic>M</italic>&#x2032;. We adopt the top-left point (<italic>x<sub>tl</sub>, y<sub>tl</sub>
</italic>) and bottom-right point (<italic>x<sub>br</sub>, y<sub>br</sub>
</italic>) to represent the location of <italic>M</italic>&#x2032; = <italic>M</italic>[<italic>x<sub>tl</sub>:x<sub>br</sub>,y<sub>tl</sub>:y<sub>br</sub>
</italic>]. With the interpolation algorithm, the pixels in the mask map <italic>M</italic> correspond one-to-one to the pixels in the input image <italic>I</italic>. Therefore, the location of <italic>M</italic>&#x2032; can be used to extract the main lesion object and discard the noisy background in <italic>I</italic>. As a result, the main lesion object <italic>I<sub>obj</sub>
</italic> is computed as:</p>
<disp-formula>
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Based on the ablation experiments in section 4.2, the COLM module can effectively improve the classification accuracy.</p>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Lesion part proposal module (LPPM)</title>
<p>Identifying similar diseases in the field is another critical problem for strawberry cultivation, especially for those diseases which have homologous backgrounds and subtle inter-class differences (e.g., the diseases at the early stage and the diseases occurring in the same part). Strengthening the differences between diseases is the key approach to address this issue (<xref ref-type="bibr" rid="B4">Cruz et&#xa0;al., 2019</xref>). The similar disease identification is in accord with the characteristics of the fine-grained image recognition (FGIR) (<xref ref-type="bibr" rid="B43">Zheng et&#xa0;al., 2017</xref>). The studies of FGIR have concluded that the discriminative features always lie in the details (<xref ref-type="bibr" rid="B9">Fu et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B26">Recasens et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B41">Zhang et&#xa0;al., 2021</xref>). Hence, we present the LPPM to localize the distinguishing lesion features in the details. The design idea of this module is derived from the region proposal algorithm (RPA) (<xref ref-type="bibr" rid="B27">Ren et&#xa0;al., 2015</xref>). The RPA is an effective method to propose candidate regions for object detection. The candidate region is called anchor in object detection. Nevertheless, the RPA requires an additional bounding box to annotate the location of the object. The bounding box annotation process is labor-intensive and subjective. Here, we take the average value of all pixels in the anchor as a confidence of whether the region in the anchor is a lesion detail. In this way, the RPA can be generalized to identify detailed lesions in the images without bounding box annotations.</p>
<p>The pipeline of LPPM is shown as <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. The LPPM takes the output (i.e., class response map) of a CRM module as input. We denote it as <italic>M</italic>
<sub>
<italic>c</italic>
</sub>&#x2208;<italic>R</italic>
<sup>
<italic>Hf</italic>&#xd7;<italic>Wf</italic>
</sup>. First, the LPPM proposes the coordinates of the anchors on <italic>M<sub>c</sub>
</italic>. By default, we use 3 aspect ratios (1:1, 2:1, 1:2) and 1 scale (H<italic>
<sub>f</sub>
</italic>/2), yielding <italic>k</italic>=3 anchors at each pixel of <italic>M<sub>c</sub>
</italic>. The total number of generated anchors is k &#xd7; H<italic>
<sub>f</sub>
</italic>&#xd7; W<italic>
<sub>f</sub>
</italic>. Each anchor is an eligible candidate for the lesion detail. The coordinates of the anchors are denoted by their top-left point <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and bottom-right point <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Second, we calculate the average value of an anchor at <italic>M<sub>c</sub>
</italic> as follows:</p>
<disp-formula>
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>a</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>tl</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>br</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mi>tl</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mi>br</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>br</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>tl</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mi>br</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>tl</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>The pipeline of LPPM. First, a class response map is generated from a CRM module. Second, the RPA is applied to propose candidate lesion regions from the class response map. Third, a non-maximum suppression is utilized to pick out the top-<italic>N</italic> lesions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g005.tif"/>
</fig>
<p>
<inline-formula>
<mml:math display="inline" id="im9">
<mml:mover accent="true">
<mml:mi>a</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:math>
</inline-formula> is the confidence of the anchor to be a lesion detail region. A higher value of &#x101; represents a higher probability of the anchor being a lesion detail. Third, we pick out the top-<italic>N</italic> anchors according to their confidence. In practice, the top-<italic>N</italic> anchors are adjacent and contain almost the same parts (<xref ref-type="bibr" rid="B27">Ren et&#xa0;al., 2015</xref>). For this reason, the direct selection of the top-<italic>N</italic> anchors will cause information redundancy.</p>
<table-wrap position="float">
<label>Algorithm 1</label>
<table frame="hsides">
<tbody>
<tr>
<td>
<preformat>
<bold>Input:</bold> The coordinate list of the anchors; The corresponding confidence list of the anchors; The IoU <italic>threshold</italic>;&#xD;<bold>Output:</bold> The top-<italic>N</italic> anchor list;&#xD;Combine the confidence list and the coordinate list with an element as <named-content content-type="inline-equation">
<inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mover accent="true">
<mml:mi>a</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</named-content>. The result is a confidence_coordinate_list;&#xD;confidence_coordinate_list &#x2190; Sort the combined list in descending order with the confidence <named-content content-type="inline-equation">
<inline-formula>
<mml:math display="inline" id="im11">
<mml:mover accent="true">
<mml:mi>a</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:math>
</inline-formula></named-content>;&#xD;anchor_list &#x2190; Initialize an empty list of selected anchors;&#xD;<bold>while</bold> <italic>Length(anchor_list)&lt; N and Length(confidence_coordinate_list)</italic> &gt; <italic>0</italic> <bold>do</bold>&#xD;A&#x2190;Pop out the first anchor element from the confidence_coordinate_list;&#xD;<bold>if</bold> <italic>anchor_list is empty</italic> <bold>then</bold> Add <italic>A</italic> to the anchor_list;&#xD;<bold>else</bold> Calculate the <italic>IoU</italic> between A and the other anchors in the anchor_list;&#xD;<bold>if</bold> <italic>IoU&lt; threshold</italic> <bold>then</bold> Add <italic>A</italic> to the anchor_list;&#xD;<bold>return</bold> <italic>the anchor_list (is the top-N list);</italic>
</preformat></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In this paper, we use the intersection over union (IoU) to indicate the redundant ratio of two anchors. The IoU between anchor <italic>A</italic>
<sub>1</sub> and anchor <italic>A</italic>
<sub>2</sub> is computed as:</p>
<disp-formula>
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2229;</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x222a;</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The IoU ratios between the anchor with the highest confidence and the other anchors are calculated. The scores of neighboring anchors will be suppressed when their IoU ratios are higher than the pre-set threshold. The threshold is set to 0.7 in this paper. The selection process of the top-<italic>N</italic> anchors is described in Algorithm 1. Note that <italic>N</italic> is a hyper-parameter which represents the defined number of lesion details. From the ablation experiments (see Section 4.2), the CALP-CNN achieves the best classification results when <italic>N</italic> is set to 5. Finally, we map the coordinate of the anchors in the top-<italic>N</italic> list to the input image <italic>I</italic> with the stride (s = H/H<sub>
<italic>f</italic>
</sub>) of the backbone network. The location of the lesion <italic>I<sub>detail</sub>
</italic> is generated as:</p>
<disp-formula>
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mtext>detail</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">[</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_2_4">
<label>2.2.4</label>
<title>Optimization strategy</title>
<p>The loss function of the proposed CALP-CNN is composed of two parts, including an intra-scale cross-entropy loss <italic>L<sub>cls</sub>
</italic> and an inter-scale pairwise ranking loss <italic>L<sub>rank</sub>
</italic>. The total loss function for an image <italic>I</italic> is defined as follows:</p>
<disp-formula>
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The <italic>L<sub>cls</sub>
</italic> and <italic>L<sub>rank</sub>
</italic> are expressed in Eq. 10 and Eq. 11, respectively.</p>
<disp-formula>
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mo>*</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mo>*</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mo>*</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mo>*</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>Y<sup>r</sup>
</italic>, <italic>Y<sup>o</sup></italic>, and <italic>Y<sup>d</sup>
</italic> are the predicted label vectors from the raw, object and detail images. <italic>Y<sup>c</sup>
</italic> is the predicted label vector using the concatenated features and <italic>Y<sup>*</sup>
</italic> is the ground truth label vector. <italic>N</italic> is the number of lesion details. <italic>L<sub>cls</sub>
</italic> is the chief loss function, which is dominant in the parameter optimization of the CALP-CNN.</p>
<disp-formula>
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>p<sup>r</sup>
</italic>, <italic>p<sup>o</sup></italic> and <italic>p<sup>d</sup>
</italic> denote the prediction probabilities from the raw, object and detail images, respectively. To be specific, the ranking loss of the probabilities <italic>p<sup>i</sup>
</italic> and <italic>p<sup>j</sup>
</italic> is defined as:</p>
<disp-formula>
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mo>{</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where &#x3b4; is a constant (by default, &#x3b4;=0.05). The ranking loss can force the object image to acquire higher predicted probabilities than the original image. Meanwhile, the detail images are forced to acquire higher predicted probabilities than the object image. In other words, the <italic>L<sub>rank</sub>
</italic> takes a coarse prediction as reference and gradually compels the network toward more discriminative regions by forcing the finer-scale images to achieve more confident predictions.</p>
</sec>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Evaluation metrics</title>
<p>In this paper, the <italic>Accuracy</italic>, <italic>Precision</italic>, <italic>Recall</italic>, and <italic>F</italic>1-<italic>score</italic> are adopted to evaluate the performance of the proposed CALP-CNN. The <italic>Accuracy</italic>, <italic>Precision</italic>, <italic>Recall</italic>, and <italic>F</italic>1-<italic>score</italic> of category <italic>i</italic> are defined as follows:</p>
<disp-formula>
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>TP<sub>i</sub>
</italic> and <italic>TN<sub>i</sub>
</italic> denote the number of samples labeled as category <italic>i</italic> and non-category <italic>i</italic> that are correctly classified, respectively. <italic>FP<sub>i</sub>
</italic> denotes the number of samples labeled as non-category <italic>i</italic> but classified as category <italic>i</italic>. <italic>FN<sub>i</sub>
</italic> denotes the number of samples labeled as category <italic>i</italic> but classified as non-category <italic>i</italic>.</p>
<p>For a multi-class classification task, the overall <italic>Accuracy</italic>, <italic>Precision</italic>, <italic>Recall</italic>, and <italic>F</italic>1-<italic>score</italic> can be defined with the average of all the categories in their binary classification case. The formulas of the overall <italic>Accuracy</italic>, <italic>Precision</italic>, <italic>Recall</italic>, and <italic>F</italic>1-<italic>score</italic> are defined as follows:</p>
<disp-formula>
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(19)</label>
<mml:math display="block" id="M19">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(20)</label>
<mml:math display="block" id="M20">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where the <italic>N<sub>c</sub>
</italic> represents the number of categories of strawberry diseases in the SCDD.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experimental results and analysis</title>
<p>We conduct a series of experiments on the testing set of the SCDD to verify the effectiveness of the proposed CALP-CNN to identify strawberry diseases by filtering the complex background features and learning the discriminative features among similar diseases. The top-N of the anchors (lesion details) is set to 5 for the LPPM in our experiments.</p>    <p>
<bold>Baselines:</bold> Because the CALP-CNN is an attention-based model and our SCDD only has image-level supervision, here we select six weakly-supervised fine-grained image recognition methods as baselines and compare their disease identification performance with the CALP-CNN method. The six baselines are described in detail as follows:</p>
<list list-type="bullet">
<list-item>
<p>MA-CNN (<xref ref-type="bibr" rid="B43">Zheng et&#xa0;al., 2017</xref>): Multi-attention convolutional neural network, which uses channel grouping to learn different part features.</p>
</list-item>
<list-item>
<p>RA-CNN (<xref ref-type="bibr" rid="B9">Fu et&#xa0;al., 2017</xref>): Recurrent attention convolutional neural network, which recurrently learns finer-scale features through an attention proposal network.</p>
</list-item>
<list-item>
<p>MMAL-Net (<xref ref-type="bibr" rid="B41">Zhang et&#xa0;al., 2021</xref>): Multi-branch and multi-scale attention network, which utilizes a saliency map to locate the main object and propose discriminative parts.</p>
</list-item>
<list-item>
<p>SSN (<xref ref-type="bibr" rid="B26">Recasens et&#xa0;al., 2018</xref>): A saliency-based sampling layer for a neural network that samples the raw image based on a saliency map with a non-uniform method.</p>
</list-item>
<list-item>
<p>TASN (<xref ref-type="bibr" rid="B44">Zheng et&#xa0;al., 2019</xref>): Trilinear attention sampling network first uses a trilinear function to enhance saliency values, then samples the raw images with these enhanced values.</p>
</list-item>
<list-item>
<p>S3N (<xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>): Selective sparse sampling network, which captures diverse and fine-grained detail from the raw image based on a class response map with a selective sparse method.</p>
</list-item>
</list>
<p>All the baselines achieve state-of-the-art performance on fine-grained benchmark datasets [e.g., CUB-200-2011 (<xref ref-type="bibr" rid="B38">Welinder et&#xa0;al., 2010</xref>) and FGVC Aircraft (<xref ref-type="bibr" rid="B21">Maji et&#xa0;al., 2013</xref>)].</p>
<p>
<bold>Implementation details:</bold> The proposed CALP-CNN is implemented with the open-source package PyTorch (<xref ref-type="bibr" rid="B24">Paszke et&#xa0;al., 2019</xref>), which can flexibly implement various CNN-based models. A ResNet-50 pre-trained on the ImageNet dataset is used as the backbone for extracting the feature maps. For a fair comparison, all baselines are re-implemented with this backbone. We use stochastic gradient descent (SGD) to optimize the network parameters. All the models are trained for 60 epochs with a batch size of 16. The initial learning rate is set to 1e-3 and is divided by 10 at the 20th and 40th epochs. The momentum is set to 0.9 and the weight decay is set to 1e-4. The input images are preprocessed to a size of 224&#xd7;224. All the experiments are performed on a Dell T5820 workstation with an NVIDIA GeForce RTX 3090 GPU and an Intel Xeon W-2200 processor.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Classification results</title>
<p>We compare the performance of the proposed CALP-CNN with the baselines on the testing set of the SCDD. The classification results are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. The CALP-CNN achieves more accurate classification results on all metrics. The CALP-CNN significantly outperforms the backbone (ResNet-50) by 9.49% on the <italic>F</italic>1-<italic>score</italic>. The overall <italic>F</italic>1-<italic>score</italic> of the CALP-CNN is higher than those of the saliency-based models, with improvements of 9.03% over SSN, 7.05% over TASN, and 6.52% over MMAL-Net. Additionally, the proposed CALP-CNN is also superior to the recurrent attention method (RA-CNN), the channel grouping attention method (MA-CNN), and the class attention method (S3N). Specifically, it improves the <italic>F</italic>1-<italic>score</italic> by 8.59%, 8.40%, and 6.55% compared with RA-CNN, MA-CNN, and S3N, respectively. Note that the improvement of our proposed model is attributable to the introduction of the COLM and LPPM. The COLM can filter the noisy background features, while the LPPM provides discriminative lesion details.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The classification performance of different methods on the SCDD.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left"/>
<th valign="top" align="center">Attention Mechanism</th>
<th valign="top" align="center">
<italic>F</italic>1-<italic>score</italic>
</th>
<th valign="top" align="center">
<italic>Accuracy</italic>
</th>
<th valign="top" align="center">
<italic>Precision</italic>
</th>
<th valign="top" align="center">
<italic>Recall</italic>
</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ResNet-50 (He et al., 2016)</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">82.47</td>
<td valign="top" align="center">84.35</td>
<td valign="top" align="center">83.49</td>
<td valign="top" align="center">82.13</td>
</tr>
<tr>
<td valign="top" align="left">RA-CNN (Fu et al., 2017)</td>
<td valign="top" align="center">part attention</td>
<td valign="top" align="center">83.37</td>
<td valign="top" align="center">85.71</td>
<td valign="top" align="center">84.56</td>
<td valign="top" align="center">83.38</td>
</tr>
<tr>
<td valign="top" align="left">MA-CNN (Zheng et al., 2017)</td>
<td valign="top" align="center">channel attention</td>
<td valign="top" align="center">83.56</td>
<td valign="top" align="center">85.82</td>
<td valign="top" align="center">84.49</td>
<td valign="top" align="center">83.92</td>
</tr>
<tr>
<td valign="top" align="left">MMAL-Net (Zhang et al., 2021)</td>
<td valign="top" align="center">saliency attention</td>
<td valign="top" align="center">
<underline>85.44</underline>
</td>
<td valign="top" align="center">
<underline>87.11</underline>
</td>
<td valign="top" align="center">85.79</td>
<td valign="top" align="center">
<underline>85.47</underline>
</td>
</tr>
<tr>
<td valign="top" align="left">SSN (Recasens et al., 2018)</td>
<td valign="top" align="center">saliency attention</td>
<td valign="top" align="center">82.93</td>
<td valign="top" align="center">84.40</td>
<td valign="top" align="center">84.01</td>
<td valign="top" align="center">82.91</td>
</tr>
<tr>
<td valign="top" align="left">TASN (Zheng et al., 2019)</td>
<td valign="top" align="center">saliency attention</td>
<td valign="top" align="center">84.91</td>
<td valign="top" align="center">87.10</td>
<td valign="top" align="center">85.72</td>
<td valign="top" align="center">84.88</td>
</tr>
<tr>
<td valign="top" align="left">S3N (Ding et al., 2019)</td>
<td valign="top" align="center">class attention</td>
<td valign="top" align="center">85.41</td>
<td valign="top" align="center">86.70</td>
<td valign="top" align="center">
<underline>86.56</underline>
</td>
<td valign="top" align="center">84.72</td>
</tr>
<tr>
<td valign="top" align="left">CALP-CNN</td>
<td valign="top" align="center">class attention</td>
<td valign="top" align="center">
<bold>91.96</bold>
</td>
<td valign="top" align="center">
<bold>92.56</bold>
</td>
<td valign="top" align="center">
<bold>92.55</bold>
</td>
<td valign="top" align="center">
<bold>91.80</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold and underlined values indicate the highest and second-highest scores for each metric, respectively.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Ablation experiments</title>
<p>In this paper, four ablation experiments are conducted to investigate the role of 1) different network branches, 2) lesion location methods (saliency map vs. class response map), 3) the number of lesion details, and 4) the ranking loss on field disease identification accuracy. The experiments show that the CNN with three branches and five lesion details (top-5) achieves the best performance. The best model is equipped with the class response map for lesion location and the ranking loss for model optimization.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Contribution of different branches</title>
<p>As shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>, the CALP-CNN consists of three main branches, i.e., the raw branch (R-branch), the object branch (O-branch), and the (lesion) details branch (D-branch). In our experiments, we temporarily remove different branches to survey the contribution of each branch. The <italic>F</italic>1-<italic>score</italic> of the ablation experiments is recorded in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The following conclusions can be drawn: 1) The <italic>F</italic>1-<italic>score</italic> of the CALP-CNN with all branches (R+O+D) is 91.96%. It drops to 87.94% when the O-branch is omitted and to 88.42% when the D-branch is removed. These results demonstrate that both the O-branch and the D-branch are capable of locating informative lesion regions. 2) The O-branch has the highest score (88.97%) among the three branches. It shows that locating and segmenting the class-related lesion object from the complex background can effectively eliminate the influence of the background on disease identification in the field. 3) The D-branch represents detailed information on lesions but does not yield the highest score among the three branches. It demonstrates that the discriminative lesion detail features are not all-inclusive for disease identification. Contextual information is also a key feature for disease identification. On the other hand, the D-branch could provide essential information to the other branches. The overall accuracy of the network features is improved from 83.92% to 87.94% in the R+D branches setting and from 87.08% to 91.21% in the O+D branches setting. Furthermore, the D-branch can collect important lesion details for similar disease identification cases. 4) Note that the absence of the O-branch results in a bigger loss (4.02%, from 91.96% to 87.94%) than the absence of the D-branch (3.54%, from 91.96% to 88.42%), suggesting that removing the background features is critical for disease identification in the field. 5) The concatenated features of the three branches achieve the best performance. It indicates that sharing the object and the lesion detail features can enhance the lesion features and suppress the influence of background features. The context information surrounding the disease is preserved in the concatenated features.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>The contribution of each branch.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Experimental Setting</th>
<th valign="top" align="center">R-branch(%)</th>
<th valign="top" align="center">O-branch(%)</th>
<th valign="top" align="center">D-branch(%)</th>
<th valign="top" align="center">Concatenation(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">R branch</td>
<td valign="top" align="center">82.47</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">82.47</td>
</tr>
<tr>
<td valign="top" align="left">R+O branches</td>
<td valign="top" align="center">82.66</td>
<td valign="top" align="center">88.97</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">88.42</td>
</tr>
<tr>
<td valign="top" align="left">R+D branches</td>
<td valign="top" align="center">83.92</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">83.01</td>
<td valign="top" align="center">87.94</td>
</tr>
<tr>
<td valign="top" align="left">O+D branches</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">87.08</td>
<td valign="top" align="center">84.37</td>
<td valign="top" align="center">91.21</td>
</tr>
<tr>
<td valign="top" align="left">R+O+D branches</td>
<td valign="top" align="center">82.44</td>
<td valign="top" align="center">88.12</td>
<td valign="top" align="center">86.05</td>
<td valign="top" align="center">91.96</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Role of different location methods</title>
<p>We re-implement the COLM and LPPM with saliency-based attention (<xref ref-type="bibr" rid="B41">Zhang et&#xa0;al., 2021</xref>) to locate the main object and the lesion details. The saliency map adopts a class-agnostic attention mechanism. Different from the saliency map, the class response map is a class-aware attention method. From <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>, we can observe that the class-aware method achieves a 5.57% higher <italic>F</italic>1-<italic>score</italic> than the class-agnostic method. It further demonstrates that the class-aware method can effectively localize class-related regions.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparison between different location methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left"/>
<th valign="top" align="center">
<italic>F</italic>1-<italic>score</italic>(%)</th>
<th valign="top" align="center">Comments</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">saliency map</td>
<td valign="top" align="center">86.39</td>
<td valign="top" align="left">class-agnostic attention</td>
</tr>
<tr>
<td valign="top" align="left">class response map</td>
<td valign="top" align="center">91.96</td>
<td valign="top" align="left">class-aware attention</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold>Number of lesion details:</bold> Ten experiments are performed to investigate the relationship between the classification result (<italic>F</italic>1-<italic>score</italic>) and the number of lesion details. As shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, the <italic>F</italic>1-<italic>score</italic> improves as the number of lesion details increases. However, the <italic>F</italic>1-<italic>score</italic> declines when the number of lesion details exceeds 5. It demonstrates that the disease classification performance does not increase monotonically with the number of lesion details. The underlying reason is that the contextual information is diluted by numerous lesion details.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Relationship between the classification accuracy (<italic>F</italic>1-<italic>score</italic>) and the number of lesion details.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g006.tif"/>
</fig>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Effect of ranking loss</title>
<p>To explore the impact of the ranking loss on classification results, we remove the ranking loss and only retain the cross-entropy loss to optimize the parameters of the CALP-CNN model. The best <italic>F</italic>1-<italic>score</italic> in 60 epochs training is 91.30%, which is 0.66% lower than the original model. The introduction of ranking loss could assist the two modules (COLM and LPPM) in localizing more discriminative regions.</p>
</sec>
</sec>
<sec id="s3_3" sec-type="results">
<label>3.3</label>
<title>Results of similar diseases identification</title>
<p>In practice, some strawberry diseases exhibit similar visual appearance and contextual information, which could result in false identification among similar diseases. In order to evaluate the effectiveness of the proposed CALP-CNN for distinguishing these similar diseases, two kinds of similar strawberry diseases are chosen in the SCDD for experiments: (1) the diseases at an early stage and (2) the diseases occurring on fruits (e.g., gray mold, powdery mildew, anthracnose). We generate two sub-datasets corresponding to the two kinds of similar strawberry diseases. The disease samples from the two sub-datasets are shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>The examples of the similar diseases in the SCDD.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g007.tif"/>
</fig>
<p>The validation results of the trained CALP-CNN and the ResNet-50 on the two sub-datasets are recorded in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>. Neither method achieves ideal identification performance. However, our CALP-CNN outperforms the ResNet-50 by 5.85% on the early-stage disease dataset and by 6.73% on the fruit disease dataset. Overall, the results suggest that the identification of similar strawberry diseases is challenging, while the discriminative lesion detail features provide helpful information to improve the identification performance.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>The performance of the CALP-CNN and the ResNet-50 on the similar disease datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Dataset</th>
<th valign="top" align="center">Amount/Categories</th>
<th valign="top" align="center">ResNet-50</th>
<th valign="top" align="center">CALP-CNN</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">early stage</td>
<td valign="top" align="center">324/10</td>
<td valign="top" align="center">59.87</td>
<td valign="top" align="center">65.72</td>
</tr>
<tr>
<td valign="top" align="left">on fruit</td>
<td valign="top" align="center">79/3</td>
<td valign="top" align="center">69.30</td>
<td valign="top" align="center">76.03</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Qualitative evaluation of lesion localization performance</title>
<p>Because most of the strawberry datasets (including the SCDD) have only image-level annotations, it is difficult to quantitatively evaluate the location accuracy of the main lesion object and the lesion details at the pixel level. Here, we follow the study of (<xref ref-type="bibr" rid="B37">Wei et&#xa0;al., 2017</xref>) to conduct a qualitative evaluation of the accuracy of the main lesion object and lesion detail detection. We randomly pick out 3 groups of diseased images from the testing set for each strawberry disease and visualize the identification results of the lesions. The experimental results are shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. In <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>, the first column of each group is the input image, and the subsequent two columns are the location results of the main lesion object and lesion details of the image, respectively. Note that the images of lesion detail have been enlarged to the same size as their input images. Based on the results of the main lesion objects, we can observe that the main lesion objects are all identified in the predicted bounding boxes of the COLM (group 1: 11/11, group 2: 11/11, group 3: 11/11). Furthermore, the predicted boxes contain contextual information by preserving the local background of the main lesion objects. In addition, most lesion details of the diseases can also be predicted by the LPPM (group 1: 54/55, group 2: 52/55, group 3: 55/55). In our experiments, the falsely predicted lesion areas occur in images that have only one lesion area of relatively small size (e.g., line 7, column 3 of group 2).</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Identified main lesion object and lesion details. For each disease, we randomly select three samples from the testing set. The first column of each sample is the diseased image, and the subsequent two columns are the location results of the main lesion object and lesion details. The labels of the diseases are consistent with those in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1091600-g008.tif"/>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussions and conclusions</title>
<p>Existing methods for crop disease identification in the field are not sufficiently accurate because of their poor ability to eliminate the interference from the background and extract discriminative features among similar diseases. Detecting and segmenting the lesion region from the disease image is a simple yet effective way to reduce the influence of the complex background. Meanwhile, learning discriminative features from the lesion details is beneficial for the identification of similar diseases. The CNN-based semantic segmentation methods can effectively segment the lesion regions from the complex background. Hence, recent studies use semantic segmentation networks to segment lesion regions from the background as the first step of their models (<xref ref-type="bibr" rid="B13">Hu et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B36">Wang et&#xa0;al., 2021a</xref>). The segmentation performance of the networks highly relies on the amount of pixel-level annotated data. The pixel-level annotation is time-consuming, laborious and expensive, which restricts the applications of CNN-based segmentation methods. Besides, many studies have shown that the CNNs can localize discriminative regions from the input image (<xref ref-type="bibr" rid="B31">Selvaraju et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B5">Dabkowski and Gal, 2017</xref>; <xref ref-type="bibr" rid="B37">Wei et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>). However, not all the located regions are useful for disease identification. The regions that are activated by the complicated background are detrimental to disease identification (<xref ref-type="bibr" rid="B2">Barbedo, 2018</xref>). Therefore, it is necessary to select the most useful region from the located regions. The identification of similar diseases is also a challenging task because the discriminative details between the similar diseases are too subtle to be well-represented by the CNNs. 
Data augmentation technologies can increase the differences among similar diseases. Nevertheless, the increment is not obvious (<xref ref-type="bibr" rid="B4">Cruz et&#xa0;al., 2019</xref>). In addition, a suitable augmentation strategy is not straightforward and requires trial and error. Hence, data augmentation technologies are not an appropriate solution for similar disease identification. Fortunately, there are many similarities between crop similar disease identification and FGIR. The FGIR focuses on how to effectively represent the discriminative features between the subordinate classes (<xref ref-type="bibr" rid="B6">Ding et&#xa0;al., 2019</xref>). Therefore, the discriminative region localization and feature representation methods in FGIR can be extended to crop similar disease identification.</p>
<p>In this paper, we take field strawberry disease identification as our study object and explore innovative methods to address the challenges caused by the complex background and similar diseases. First, we enhance the ability of the CNN backbone to localize discriminative regions through a new class-attention-based mechanism (i.e., class response map). Second, we construct the COLM based on the flood-fill algorithm to extract the most useful lesion region from the complex background. Third, we propose a new lesion part proposal method (i.e., the LPPM) to extract the discriminative lesion details based on the RPA. The COLM and LPPM are connected in series to form a Class-Attention-based Lesion Proposal Convolutional Neural Network (CALP-CNN), which can simultaneously address the challenges caused by complex background and similar diseases in field disease identification.</p>
<p>A series of experiments are conducted on the constructed field strawberry common disease dataset to verify the effectiveness of the CALP-CNN in eliminating the interference from the complicated background and distinguishing similar strawberry diseases. The classification result on <italic>F</italic>1-<italic>score</italic> reaches 91.96%, which is considerably higher than those of other methods, showing that the proposed model outperforms other state-of-the-art methods in terms of field strawberry disease identification. In addition, the ablation results on <italic>F</italic>1-<italic>score</italic> drop to 87.94% and 88.42%, respectively, when the COLM and LPPM branches in the CALP-CNN are removed. It indicates that both background feature elimination and discriminative lesion detail feature representation are indispensable for field disease identification.</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>XH, RW, JD, and TX conceived the idea and designed the network. XH, LJ, and YH contributed to collecting the dataset. XH wrote the code, validated the method, and wrote the paper. TX, JD, and LJ revised the paper. All authors contributed to the article and approved the submitted version.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>This work was supported by the National Key Research and Development Program of China-Intergovernmental International Scientific and Technological Innovation Cooperation (2019YFE0125700).</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arsenovic</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Karanovic</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sladojevic</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Anderla</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Stefanovic</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Solving current limitations of deep learning based approaches for plant disease detection</article-title>. <source>Symmetry</source> <volume>11</volume>, <fpage>939</fpage>&#x2013;<lpage>960</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/sym11070939</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barbedo</surname> <given-names>J. G.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Factors influencing the use of deep learning for plant disease recognition</article-title>. <source>Biosyst. Eng.</source> <volume>172</volume>, <fpage>84</fpage>&#x2013;<lpage>91</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2018.05.013</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>L.-C.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Papandreou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Encoder-decoder with atrous separable convolution for semantic image segmentation</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision (ECCV)</conf-name>, <fpage>801</fpage>&#x2013;<lpage>818</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1802.02611</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cruz</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Ampatzidis</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Pierro</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Materazzi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Panattoni</surname> <given-names>A.</given-names>
</name>
<name>
<surname>De Bellis</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Detection of grapevine yellows symptoms in <italic>Vitis vinifera</italic> L. with artificial intelligence</article-title>. <source>Comput. Electron. Agric.</source> <volume>157</volume>, <fpage>63</fpage>&#x2013;<lpage>76</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.12.028</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dabkowski</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Gal</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Real time image saliency for black box classifiers</article-title>,&#x201d; in <source>Proceedings of the 31st International Conference on Neural Information Processing Systems</source>, Vol. <volume>NIPS&#x2019;17</volume>. <fpage>6970</fpage>&#x2013;<lpage>6979</lpage> (Curran Associates Inc: Red Hook, NY, USA). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1705.07857</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ding</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Selective sparse sampling for fine-grained image recognition</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (IEEE). <fpage>6599</fpage>&#x2013;<lpage>6608</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00670</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dwivedi</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Vijh</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Chaturvedi</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Study of machine learning techniques for plant disease recognition in agriculture</article-title>,&#x201d; in <source>2021 11th International Conference on Cloud Computing, Data Science &amp; Engineering (Confluence)</source> (<publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/Confluence51648.2021.9377186</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ferentinos</surname> <given-names>K. P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep learning models for plant disease detection and diagnosis</article-title>. <source>Comput. Electron. Agric.</source> <volume>145</volume>, <fpage>311</fpage>&#x2013;<lpage>318</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.01.009</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Mei</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Look closer to see better: Recurrent attention convolutional neural network for fine-grained image recognition</article-title>,&#x201d; in <source>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Los Alamitos, CA, USA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>), <fpage>4476</fpage>&#x2013;<lpage>4484</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2017.476</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hannum</surname> <given-names>S. M.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Potential impact of strawberries on human health: A review of the science</article-title>. <source>Crit. Rev. Food Sci. Nutr.</source> <volume>44</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1080/10408690490263756</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <source>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-name>IEEE</publisher-name>). <fpage>770</fpage>&#x2013;<lpage>778</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>K. Y.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Application of artificial neural network for detecting <italic>Phalaenopsis</italic> seedling diseases using color and texture features</article-title>. <source>Comput. Electron. Agric.</source> <volume>57</volume>, <fpage>3</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2007.01.015</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Estimation of tea leaf blight severity in natural scene images</article-title>. <source>Precis. Agric.</source> <volume>22</volume>, <fpage>1239</fpage>&#x2013;<lpage>1262</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-020-09782-8</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iqbal</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Jamshaid</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zahid</surname> <given-names>M. A.</given-names>
</name>
<name>
<surname>Andreasson</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Vetukuri</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Stenberg</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Biological control of strawberry crown rot, root rot and grey mould by the beneficial fungus <italic>Aureobasidium pullulans</italic>
</article-title>. <source>BioControl</source> <volume>66</volume>, <fpage>535</fpage>&#x2013;<lpage>545</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10526-021-10083-w</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iqbal</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>M. A.</given-names>
</name>
<name>
<surname>Sharif</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Shah</surname> <given-names>J. H.</given-names>
</name>
<name>
<surname>ur Rehman</surname> <given-names>M. H.</given-names>
</name>
<name>
<surname>Javed</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>An automated detection and classification of citrus plant diseases using image processing techniques: A review</article-title>. <source>Comput. Electron. Agric.</source> <volume>153</volume>, <fpage>12</fpage>&#x2013;<lpage>32</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.07.032</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Johannes</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Picon</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Alvarez-Gila</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Echazarra</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Rodriguez-Vaamonde</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Navajas</surname> <given-names>A. D.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Automatic plant disease diagnosis using mobile capture devices, applied on a wheat use case</article-title>. <source>Comput. Electron. Agric.</source> <volume>138</volume>, <fpage>200</fpage>&#x2013;<lpage>209</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2017.04.013</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kaur</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Aggarwal</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Verma</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Detection and classification of disease affected region of plant leaves using image processing technique</article-title>. <source>Indian J. Sci. Technol.</source> <volume>9</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.17485/ijst/2016/v9i48/104765</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname> <given-names>D. G.</given-names>
</name>
<name>
<surname>Burks</surname> <given-names>T. F.</given-names>
</name>
<name>
<surname>Qin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Bulanon</surname> <given-names>D. M.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Classification of grapefruit peel diseases using color texture feature analysis</article-title>. <source>Int. J. Agric. Biol. Eng.</source> <volume>2</volume>, <fpage>41</fpage>&#x2013;<lpage>50</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3965/j.issn.1934-6344.2009.03.041-050</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Krizhevsky</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Sutskever</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Hinton</surname> <given-names>G. E.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>ImageNet classification with deep convolutional neural networks</article-title>,&#x201d; in <source>Advances in neural information processing systems</source>, vol. <volume>25</volume> . Eds. <person-group person-group-type="editor">
<name>
<surname>Pereira</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Burges</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Bottou</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Weinberger</surname> <given-names>K.</given-names>
</name>
</person-group> (<publisher-name>Curran Associates, Inc</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3065386</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liang</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Xiang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Coppola</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>PD2SE-Net: Computer-assisted plant disease diagnosis and severity estimation network</article-title>. <source>Comput. Electron. Agric.</source> <volume>157</volume>, <fpage>518</fpage>&#x2013;<lpage>529</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.01.034</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Maji</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Rahtu</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Kannala</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Blaschko</surname> <given-names>M. B.</given-names>
</name>
<name>
<surname>Vedaldi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Fine-grained visual classification of aircraft</article-title>. <source>ArXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1306.5151</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mohanty</surname> <given-names>S. P.</given-names>
</name>
<name>
<surname>Hughes</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Salath&#xe9;</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Using deep learning for image-based plant disease detection</article-title>. <source>Front. Plant Sci.</source> <volume>7</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2016.01419</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ngugi</surname> <given-names>L. C.</given-names>
</name>
<name>
<surname>Abdelwahab</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Abo-Zahhad</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tomato leaf segmentation algorithms for mobile phone applications using deep learning</article-title>. <source>Comput. Electron. Agric.</source> <volume>178</volume>, <elocation-id>105788</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105788</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname> <given-names>G.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;<article-title>PyTorch: An imperative style, high-performance deep learning library</article-title>,&#x201d; in <source>Advances in neural information processing systems</source>, vol. <volume>32</volume> . Eds. <person-group person-group-type="editor">
<name>
<surname>Wallach</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Larochelle</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Alch&#xe9;-Buc</surname> <given-names>F. D.</given-names>
</name>
<name>
<surname>Fox</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Garnett</surname> <given-names>R.</given-names>
</name>
</person-group> (<publisher-name>Curran Associates, Inc</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1912.01703</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Picon</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Alvarez-Gila</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Seitz</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Ortiz-Barredo</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Echazarra</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Johannes</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep convolutional neural networks for mobile capture device-based crop disease classification in the wild</article-title>. <source>Comput. Electron. Agric.</source> <volume>161</volume>, <fpage>280</fpage>&#x2013;<lpage>290</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.04.002</pub-id>. BigData and DSS in Agriculture.</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Recasens</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kellnhofer</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Stent</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Matusik</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Torralba</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Learning to zoom: a saliency-based sampling layer for neural networks</article-title>,&#x201d; in <source>Proceedings of the European Conference on Computer Vision (ECCV)</source> (<publisher-name>Springer International Publishing</publisher-name>), <fpage>51</fpage>&#x2013;<lpage>66</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-01240-3_4</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>,&#x201d; in <source>Advances in neural information processing systems</source>, vol. <volume>28</volume> . Eds. <person-group person-group-type="editor">
<name>
<surname>Cortes</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Lawrence</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Sugiyama</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Garnett</surname> <given-names>R.</given-names>
</name>
</person-group> (<publisher-name>Curran Associates, Inc</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Revathi</surname> <given-names>P. B.</given-names>
</name>
<name>
<surname>Hemalatha</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Cotton leaf spot diseases detection utilizing feature selection with skew divergence method</article-title>. <source>Int. J. Sci. Eng. Technol.</source> <volume>3</volume>, <fpage>22</fpage>&#x2013;<lpage>30</lpage>. Available at: <uri xlink:href="https://www.ijset.com/publication/v3/005.pdf">https://www.ijset.com/publication/v3/005.pdf</uri>.</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). <source>U-Net: Convolutional networks for biomedical image segmentation</source> (<publisher-name>Springer International Publishing</publisher-name>), <fpage>234</fpage>&#x2013;<lpage>241</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sankaran</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Mishra</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Ehsani</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Davis</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>A review of advanced techniques for detecting plant diseases</article-title>. <source>Comput. Electron. Agric.</source> <volume>72</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2010.02.007</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Selvaraju</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>Cogswell</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Das</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Vedantam</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Parikh</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Batra</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Grad-CAM: Visual explanations from deep networks <italic>via</italic> gradient-based localization</article-title>,&#x201d; in <source>2017 IEEE International Conference on Computer Vision (ICCV)</source> (<publisher-name>IEEE</publisher-name>). <fpage>618</fpage>&#x2013;<lpage>626</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.74</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Srdjan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Marko</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Anderla</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Culibrk</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Stefanovic</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Deep neural networks based recognition of plant diseases by leaf image classification</article-title>. <source>Comput. Intell. Neurosci.</source> <volume>2016</volume>, <elocation-id>3289801</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2016/3289801</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Szegedy</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sermanet</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Reed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Anguelov</surname> <given-names>D.</given-names>
</name>
<etal/>
</person-group>. (<year>2015</year>). &#x201c;<article-title>Going deeper with convolutions</article-title>,&#x201d; in <source>2015 IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2015.7298594</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Too</surname> <given-names>E. C.</given-names>
</name>
<name>
<surname>Yujian</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Njuki</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yingchun</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A comparative study of fine-tuning deep learning models for plant disease identification</article-title>. <source>Comput. Electron. Agric.</source> <volume>161</volume>, <fpage>272</fpage>&#x2013;<lpage>279</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.03.032</pub-id>. BigData and DSS in Agriculture.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Di</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>b). <article-title>Dissipation, accumulation and risk assessment of fungicides after repeated spraying on greenhouse strawberry</article-title>. <source>Sci. Total Environ.</source> <volume>758</volume>, <elocation-id>144067</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.scitotenv.2020.144067</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>a). <article-title>A cucumber leaf disease severity classification method based on the fusion of DeepLabV3+ and U-Net</article-title>. <source>Comput. Electron. Agric.</source> <volume>189</volume>, <elocation-id>106373</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106373</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname> <given-names>X.-S.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>J.-H.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Z.-H.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Selective convolutional descriptor aggregation for fine-grained image retrieval</article-title>. <source>IEEE Trans. Image Process.</source> <volume>26</volume>, <fpage>2868</fpage>&#x2013;<lpage>2881</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIP.2017.2688133</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Welinder</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Branson</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Mita</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wah</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Perona</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2010</year>). <source>Caltech-UCSD Birds 200</source> (<publisher-name>California Institute of Technology</publisher-name>). Available at: <uri xlink:href="https://authors.library.caltech.edu/27468/1/WelinderEtal10_CUB-200.pdf">https://authors.library.caltech.edu/27468/1/WelinderEtal10_CUB-200.pdf</uri>.</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>G. F.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Z. K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X. Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A rapid, low-cost deep learning system to classify strawberry disease based on cloud service</article-title>. <source>J. Integr. Agric.</source> <volume>21</volume>, <fpage>460</fpage>&#x2013;<lpage>473</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S2095-3119(21)63604-3</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zeiler</surname> <given-names>M. D.</given-names>
</name>
<name>
<surname>Fergus</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Visualizing and understanding convolutional networks</article-title>,&#x201d; in <conf-name>Computer Vision &#x2013; ECCV 2014</conf-name> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>818</fpage>&#x2013;<lpage>833</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-10590-1_53</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Multi-branch and multi-scale attention learning for fine-grained visual categorization</article-title>,&#x201d; in <conf-name>International Conference on Multimedia Modeling</conf-name>. <fpage>136</fpage>&#x2013;<lpage>147</lpage> (<publisher-name>Springer</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2003.09150</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Multiple disease detection method for greenhouse-cultivated strawberry based on multiscale feature fusion Faster R-CNN</article-title>. <source>Comput. Electron. Agric.</source> <volume>199</volume>, <elocation-id>107176</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107176</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Mei</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Learning multi-attention convolutional neural network for fine-grained image recognition</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Computer Vision (ICCV)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>5219</fpage>&#x2013;<lpage>5227</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.557</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zha</surname> <given-names>Z.-J.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Looking for the devil in the details: Learning trilinear attention sampling network for fine-grained image recognition</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00515</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Khosla</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Lapedriza</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Oliva</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Torralba</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Learning deep features for discriminative localization</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>2921</fpage>&#x2013;<lpage>2929</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.319</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>