<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2021.731688</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A Plant Disease Recognition Method Based on Fusion of Images and Graph Structure Text</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Chunshan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1375561/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhou</surname> <given-names>Ji</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhang</surname> <given-names>Yan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wu</surname> <given-names>Huarui</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhao</surname> <given-names>Chunjiang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Teng</surname> <given-names>Guifa</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Jiuxi</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>National Engineering Research Center for Information Technology in Agriculture</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>School of Information Science and Technology, Hebei Agricultural University</institution>, <addr-line>Baoding</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Beijing Research Center for Information Technology in Agriculture</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Hebei Key Laboratory of Agricultural Big Data</institution>, <addr-line>Baoding</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>School of Mechanical and Electrical Engineering, Hebei Agricultural University</institution>, <addr-line>Baoding</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Lei Shu, Nanjing Agricultural University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Jucheng Yang, Tianjin University of Science and Technology, China; Chengcheng Chen, Jilin University, China</p></fn>
<corresp id="c001">&#x002A;Correspondence: Huarui Wu, <email>wuhr@nercita.org.cn</email></corresp>
<corresp id="c002">Chunjiang Zhao, <email>zhaocj@nercita.org.cn</email></corresp>
<fn fn-type="other" id="fn004"><p>This article was submitted to Sustainable and Intelligent Phytoprotection, a section of the journal Frontiers in Plant Science</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>14</day>
<month>01</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>731688</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>08</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>12</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2022 Wang, Zhou, Zhang, Wu, Zhao, Teng and Li.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Wang, Zhou, Zhang, Wu, Zhao, Teng and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>The disease image recognition models based on deep learning have achieved relative success under limited and restricted conditions, but such models are generally subjected to the shortcoming of weak robustness. The model accuracy would decrease obviously when recognizing disease images with complex backgrounds under field conditions. Moreover, most of the models based on deep learning only involve characterization learning on visual information in the image form, while the expression of other modal information rather than the image form is often ignored. The present study targeted the main invasive diseases in tomato and cucumber as the research object. Firstly, in response to the problem of weak robustness, a feature decomposition and recombination method was proposed to allow the model to learn image features at different granularities so as to accurately recognize different test images. Secondly, by extracting the disease feature words from the disease text description information composed of continuous vectors and recombining them into the disease graph structure text, the graph convolutional neural network (GCN) was then applied for feature learning. Finally, a vegetable disease recognition model based on the fusion of images and graph structure text was constructed. The results show that the recognition accuracy, precision, sensitivity, and specificity of the proposed model were 97.62, 92.81, 98.54, and 93.57%, respectively. This study improved the model robustness to a certain extent, and provides ideas and references for the research on the fusion method of image information and graph structure information in disease recognition.</p>
</abstract>
<kwd-group>
<kwd>disease recognition</kwd>
<kwd>graph convolutional neural network</kwd>
<kwd>text recognition</kwd>
<kwd>robustness</kwd>
<kwd>fusion</kwd>
</kwd-group>
<counts>
<fig-count count="9"/>
<table-count count="7"/>
<equation-count count="11"/>
<ref-count count="25"/>
<page-count count="12"/>
<word-count count="6575"/>
</counts>
</article-meta>
</front>
<body>
<sec id="S1" sec-type="intro">
<title>Introduction</title>
<p>Diseases, as one of the main factors affecting the growth of crops, can cause an average annual loss of crop yield up to more than 10%. Diseases not only directly lead to the loss of crop yield, but also severely affect the quality of agricultural products and even cause food safety incidents. Therefore, automatic recognition of crop diseases plays a significant role in diagnosing the disease type as early as possible, making correct control decisions and minimizing yield loss. Meanwhile, automatic disease recognition can also help mitigate the environmental impact of chemical inputs, reduce production costs, decrease agricultural workers&#x2019; exposure to pesticides, and promote healthy and sustainable agricultural development.</p>
<p>The advancement of machine learning technology provides new opportunities for the image recognition of crop diseases. This technology has been widely utilized to recognize crop diseases in recent years (e.g., <xref ref-type="bibr" rid="B17">Mohanty et al., 2016</xref>; <xref ref-type="bibr" rid="B6">Fuentes et al., 2017</xref>; <xref ref-type="bibr" rid="B22">Wang et al., 2017</xref>; <xref ref-type="bibr" rid="B5">Ferentinos, 2018</xref>; <xref ref-type="bibr" rid="B7">Geetharamani and Pandian, 2019</xref>; <xref ref-type="bibr" rid="B2">Chen et al., 2020</xref>; <xref ref-type="bibr" rid="B25">Zhong and Zhao, 2020</xref>). <xref ref-type="bibr" rid="B20">Too et al. (2019)</xref> used DenseNet (<xref ref-type="bibr" rid="B10">Huang et al., 2016</xref>) for disease classification. <xref ref-type="bibr" rid="B15">Li Z. et al. (2020)</xref> conducted vegetable disease recognition by combining SEnet (<xref ref-type="bibr" rid="B11">Jie et al., 2017</xref>) with InceptionV3 (<xref ref-type="bibr" rid="B19">Szegedy et al., 2016</xref>). Regarding disease detection, <xref ref-type="bibr" rid="B13">Li J. H. et al. (2020)</xref> and <xref ref-type="bibr" rid="B23">Yang et al. (2020)</xref> proposed to use the Faster-RCNN target detection network to replace the artificial disease spot extraction method for the task of disease spot detection. The studies above have achieved high recognition accuracy, but it is noteworthy that, in the datasets they used (whether public datasets with simple backgrounds or self-collected datasets with complex backgrounds), the disease features were mostly concentrated in the central area of the images. Thus, although the accuracy of the deep learning model after training was relatively high on the dataset with the same disease severity, the growth state of the diseased leaves and the difference in data collection time might affect the later recognition effect. 
This problem was also found in most of other image recognition models (<xref ref-type="bibr" rid="B4">Eitrich et al., 2007</xref>; <xref ref-type="bibr" rid="B16">Menardi and Torelli, 2012</xref>). Therefore, improvement on the robustness of the recognition model is of great significance for practical application. Considering the impact of the locations of disease spots on the final recognition results, this study proposed a method of feature decomposition and recombination for constructing a secondary dataset. According to the difference in decomposition granularity, the diseased areas might appear randomly in any position of the image, so as to eliminate the impact of the location of disease spots on the robustness of the recognition model.</p>
<p>Many modalities, such as image and text, can be used for recording and describing the features of crop diseases. Among various modalities, the RGB image modality can illustrate the visual features of the disease, which can be learned by deep convolutional neural networks; it is therefore the mainstream method of disease recognition at present. Another effective way to describe disease features is text, that is, to express the visual information in disease images in the form of text description. The advantage of the text modality is that text description is automatically focused on the key areas and features (e.g., leaves and disease spots) in the images. When describing disease features in the form of text, the knowledge graph can use structured data to perform pre-learning among different disease features, in order to simplify the learning process of text features. By fusing image information with text information to form complementary representation, it is possible to improve the performance of disease recognition. <xref ref-type="bibr" rid="B21">Wang et al. (2021)</xref> used the text vector form to represent non-image information of the disease, and combined it with image information for joint training, which improved the utilization rate of non-image modal information. In tasks of fine-grained image recognition, <xref ref-type="bibr" rid="B18">Reed et al. (2016)</xref>, <xref ref-type="bibr" rid="B8">He and Peng (2017)</xref>, and <xref ref-type="bibr" rid="B9">He and Peng (2019)</xref> carried out image and text joint training by applying different training forms and feature expressions, which effectively solved the problem that the image modal expression was similar but the utilization rate of other modalities was weak in fine-grained image recognition. 
In the aforementioned research works, other modal information rather than image data was mostly expressed in the form of text vectors to create the semantic relevance, while the features between categories were independent of each other, making text modeling relatively easy. However, in the field of disease recognition, there is a certain level of relevance between the information of different diseases. For example, the disease spots of cucumber downy mildew and cucumber bacterial angular leaf spot are both polygonal in shape, and the disease spots of tomato powdery mildew and cucumber powdery mildew are both white. When independent text representation methods (e.g., bag-of-words model and Word2Vec) were used, the representations between different disease features were still independent of each other, making it impossible to establish connections between similar diseases. As a special data representation form, graph structure can be used to accurately describe the relationship between nodes. Therefore, compiling disease text information into graph structure information can greatly accelerate the learning process among various disease categories.</p>
<p>With the development and application of knowledge graph in practice, an increasing number of graph structure databases have been established. However, as knowledge graph is mostly created by human labor, its entity extraction and entity relationship extraction need to consume a lot of manpower and material resources. Thus, the graph neural network and graph convolutional neural network (GCN) (<xref ref-type="bibr" rid="B12">Kip and Welling, 2016</xref>; <xref ref-type="bibr" rid="B14">Li et al., 2018</xref>) based on graph structure were proposed, which could autonomously learn the relationship between entities in graph structure data so as to fully exert the advantages of data that is suitable for graph structure representation. <xref ref-type="bibr" rid="B3">Chen et al. (2019)</xref> performed image multi-label classification using GCN and modeled multi-label images with graph structure; eventually, they achieved higher recognition accuracy than other multi-label classification tasks. <xref ref-type="bibr" rid="B24">Yao et al. (2019)</xref> constructed a text graph structure based on text corpus by using the degree of adjacency between words and text words, and conducted GCN training on the text in the form of graph. They also achieved higher accuracy than other text classification methods. In the present study, a disease text graph structure was constructed according to the number of adjacency times between disease feature words and the overall disease description text. Then, by fusing the convolutional neural network with the GCN, a vegetable disease recognition model based on feature decomposition and recombination of images and graph structure text was proposed. The main contributions of this paper are as follows:</p>
<list list-type="simple">
<list-item>
<label>1.</label>
<p>A vegetable disease recognition model with fusion of images and graph structure text was proposed, which could realize effective use of disease image information and disease description information.</p>
</list-item>
<list-item>
<label>2.</label>
<p>Aiming at the shortcomings of conventional disease recognition methods such as poor image modal discrimination and low information utilization rate, the knowledge text graph structure data was used for synchronized training, which provides a knowledge reference for the image recognition process.</p>
</list-item>
<list-item>
<label>3.</label>
<p>An image decomposition and recombination method was proposed, which could eliminate the impact of the location of disease spots on the recognition results and thereby improve the robustness of the model.</p>
</list-item>
</list>
</sec>
<sec id="S2" sec-type="materials|methods">
<title>Materials and Methods</title>
<sec id="S2.SS1">
<title>Data Acquisition</title>
<p>All the datasets used in the present study were acquired from the National Precision Agriculture Demonstration Base. The self-collected image data (covering six diseases: tomato powdery mildew, tomato early blight, cucumber powdery mildew, cucumber virus disease, cucumber downy mildew, and cucumber bacterial angular leaf spot) consisted of 1,715 leaf images, which were divided into the training set, validation set and test set according to the ratio of 7:2:1. Taking into account the impact of different lighting conditions on the image, the images were captured from June to November in three time periods: morning (7:00&#x2013;8:00), noon (11:00&#x2013;12:00), and evening (17:00&#x2013;18:00), as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The images were captured in JPG format by mobile phones (e.g., Huawei and iPhone). Since the images were taken with different devices, all images were resized to 224 &#x00D7; 224 in order to unify the image size. The original disease description text consisted of 1,715 sentences, which were manually described by five plant protection experts. The disease graph structure was then constructed according to the number of adjacency times of disease words. The original disease text is shown in <xref ref-type="table" rid="T1">Table 1</xref>. Each image-text pair was used only once in the training process.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Example of dataset. <bold>(A)</bold> Tomato powdery mildew. <bold>(B)</bold> Tomato early blight. <bold>(C)</bold> Cucumber powdery mildew. <bold>(D)</bold> Cucumber virus disease. <bold>(E)</bold> Cucumber downy mildew. <bold>(F)</bold> Cucumber bacterial angular leaf spot.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g001.tif"/>
</fig>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Example of original disease description text.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Disease categories</td>
<td valign="top" align="left">Description text</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Tomato powdery mildew</td>
<td valign="top" align="left">There are two large and small white powder spots on the lower half of the surface of tomato leaves</td>
</tr>
<tr>
<td valign="top" align="left">Tomato early blight</td>
<td valign="top" align="left">On the front of the tomato leaf, there are several taupe spots with concentric rings</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber powdery mildew</td>
<td valign="top" align="left">Cucumber leaves have more white, powdery spots on the underside</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber virus disease</td>
<td valign="top" align="left">Cucumber leaves are striped on the front and wrinkled around the leaves</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber downy mildew</td>
<td valign="top" align="left">There is a small square yellowish-green spot on the surface of a cucumber leaf</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber bacterial angular leaf spot</td>
<td valign="top" align="left">The surface of cucumber was damaged and evenly scattered with light yellow spots</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="S2.SS2">
<title>Decomposition and Recombination of Disease Features</title>
<sec id="S2.SS2.SSS1">
<title>Image Modality</title>
<p>In most of the disease images, the diseased leaf occupies the central area of the image; particularly, the learner usually regards the appearance of the disease spot in the center of the image as one of the features for disease recognition during the learning process. If the diseased area appeared in a non-central position, the recognition result might be subjected to bias. Aiming at this phenomenon, a feature decomposition and recombination method was proposed, which allowed the diseased area to randomly appear at any position of the image (see <xref ref-type="fig" rid="F2">Figure 2</xref> for the process flow).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>The feature decomposition and recombination method for the image modality.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g002.tif"/>
</fig>
</sec>
<sec id="S2.SS2.SSS2">
<title>Text Modality</title>
<p>The disease images in this study were all collected from the field environment. Unlike the images collected from laboratories with simple backgrounds, our images contained not only diseased leaves, but also complex background information. Moreover, the background information might change with the growth of the plant. For example, in the seedling stage of the plant, the background information was mostly soil or ground film; in the flowering stage, the background information might contain flowers; and in the fruiting stage, the background information might contain fruits. As a result, there were significant intra-category differences but insignificant inter-category differences for the same type of disease. In addition, the background information of images captured from different environments (e.g., facility environment or open-field environment) also differed greatly. When the visual information of the disease displayed by the image was being re-described in the form of text, the descriptions mainly focused on the key features of diseased leaves and disease spots (e.g., shape, color, texture, and position), while the background information was no longer included. This process managed to decouple complex background information and disease visual features to a certain extent, and thereby solved the problem of reduced recognition accuracy caused by the confusion between the backgrounds and disease features. Further, the use of natural language to describe disease features is characterized with the advantage of natural flexibility (for example, white might be described as light white, gray-white, etc.), which diversifies the disease text description and improves the robustness of the recognition model. The conventional text vectorization methods mainly use continuous vectors or dense vectors to represent text words or characters. In such a situation, the words are arranged in a continuous form without any spatial relationship. 
In the present study, the disease feature words in the continuous text were extracted and recombined into graph structure data that carries a spatial relationship. The workflow of feature decomposition and recombination is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>The feature decomposition and recombination method for the text modality.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g003.tif"/>
</fig>
</sec>
</sec>
<sec id="S2.SS3">
<title>Construction of the Vegetable Disease Recognition Model</title>
<p>For the convolutional neural network model based on image data, the disease features were extracted in the form of convolution kernel sliding, while for the GCN model based on graph data, the disease features were extracted based on the relationship between the graph structure and the features of the node itself. Taking into account the correlation between the number of model parameters and model accuracy, the convolutional neural network may be set with different numbers of layers for feature extraction. But in the GCN, due to the limitation of the number of node hops, a two-layer network structure would usually be sufficient to achieve an ideal effect. The network structure of the vegetable disease recognition model constructed in this study is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>Network structure.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g004.tif"/>
</fig>
<sec id="S2.SS3.SSS1">
<title>Image Branch</title>
<p>In order to mitigate the impact of the location of the diseased area on the learning process, a given original image I was randomly segmented and disarranged at different granularities. The specific rules are shown in Eq. 1.</p>
<disp-formula id="S2.E1"><label>(1)</label><mml:math id="M1"><mml:mrow><mml:mover accent="true"><mml:mtext>I</mml:mtext><mml:mo stretchy="false">~</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mrow><mml:mtext>F</mml:mtext><mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi mathvariant="normal">I</mml:mi><mml:mo>/</mml:mo><mml:mi mathvariant="normal">s</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mtext>s</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn><mml:mo>,</mml:mo><mml:mn>8</mml:mn></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where F(&#x22C5;) refers to the recombination function after random disarrangement; <italic>s</italic> refers to the granularity of image segmentation. In this study, three granularities were set, namely 2, 4, and 8, and the corresponding numbers of image blocks after segmentation were 4, 16, and 64, respectively. As the granularity continued to increase, the level of image confusion would gradually increase.</p>
<p>After obtaining image blocks at different granularities upon segmentation, disarrangement, and recombination, the images were input into the convolutional neural network model, and the classification results were combined to generate the final recognition results. Then, the loss value was calculated and the parameters were updated based on the results. The specific training process is shown in Eq. 2.</p>
<disp-formula id="S2.E2"><label>(2)</label><mml:math id="M2"><mml:mrow><mml:msub><mml:mtext>P</mml:mtext><mml:mrow><mml:mtext>img</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>S</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>N</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn><mml:mo>,</mml:mo><mml:mn>8</mml:mn></mml:mrow></mml:mrow></mml:munderover><mml:msub><mml:mtext>P</mml:mtext><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where P<sub>img</sub> refers to the final classification results; P<sub>i</sub> refers to the classification results at different granularities; S(&#x22C5;) refers to the softmax function. Since there were disease spots at different granularities in the training process, the location of the disease spots was not fixed. Therefore, the model would be able to better adapt to complex and diverse disease images after the training process, so as to improve the robustness of the model.</p>
</sec>
<sec id="S2.SS3.SSS2">
<title>Graph Structure Branch</title>
<p>In order to better represent the relationship among disease attributes, feature words were extracted from the sequential text data and were recombined to form the graph structure data. Let G = {V,E} be a given graph dataset, where V refers to the node set and E refers to the edge set. Based on G, the adjacency matrix A and the degree matrix D can be obtained, and eventually, the feature matrix of the graph data can be obtained. Specifically, the feature matrix after the first layer of graph convolution L<sup>(1)</sup> can be derived through Eq. 3, and the feature matrix after the second layer of graph convolution L<sup>(2)</sup> can be derived through Eq. 4. Considering that the GCN might encounter the phenomenon of gradient disappearance or gradient explosion after multi-layer operations and in view of the characteristics of the disease dataset itself, only two graph convolution operations were performed in this study.</p>
<disp-formula id="S2.E3"><label>(3)</label><mml:math id="M3"><mml:mrow><mml:msup><mml:mtext>L</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">&#x03C1;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mtext>A</mml:mtext><mml:mo stretchy="false">~</mml:mo></mml:mover><mml:msub><mml:mtext>XW</mml:mtext><mml:mn>0</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<disp-formula id="S2.E4"><label>(4)</label><mml:math id="M4"><mml:mrow><mml:msup><mml:mtext>L</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">&#x03C1;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mtext>A</mml:mtext><mml:mo stretchy="false">~</mml:mo></mml:mover><mml:msup><mml:mtext>L</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:msub><mml:mtext>W</mml:mtext><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="INEQ9"><mml:mrow><mml:mover accent="true"><mml:mtext>A</mml:mtext><mml:mo stretchy="false">~</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mrow><mml:msup><mml:mtext>D</mml:mtext><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:msup><mml:msup><mml:mtext>AD</mml:mtext><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:msup></mml:mrow></mml:mrow></mml:math></inline-formula>; W<sub>0</sub> and W<sub>1</sub> refer to the weight matrix; &#x03C1;(&#x22C5;) refers to the activation function. In this study, the ReLU function was used. After the two-layer graph convolution operation, the classification results of node data were obtained. The classification process is shown in Eq. 5.</p>
<disp-formula id="S2.E5"><label>(5)</label><mml:math id="M5"><mml:mrow><mml:mrow><mml:msub><mml:mtext>P</mml:mtext><mml:mrow><mml:mtext>graph</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>S</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mtext>A</mml:mtext><mml:mo stretchy="false">~</mml:mo></mml:mover><mml:mi mathvariant="normal">&#x03C1;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mtext>A</mml:mtext><mml:mo stretchy="false">~</mml:mo></mml:mover><mml:msub><mml:mtext>XW</mml:mtext><mml:mn>0</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mtext>W</mml:mtext><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>Finally, the recognition results of the image branch and the graph structure branch were fused, ensuring that the recognition process not only learned the visual features of the disease image, but also incorporated the visual features of the disease contained in the text description. The specific classification process is shown in Eq. 6. The loss functions of both the image branch and the graph structure branch are cross-entropy loss functions, as shown in Eq. 7.</p>
<disp-formula id="S2.E6"><label>(6)</label><mml:math id="M6"><mml:mrow><mml:mtext>P</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mtext>P</mml:mtext><mml:mrow><mml:mtext>img</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mtext>P</mml:mtext><mml:mrow><mml:mtext>graph</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:math></disp-formula>
<disp-formula id="S2.E7"><label>(7)</label><mml:math id="M7"><mml:mrow><mml:mtext>Loss</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mrow><mml:munderover><mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo><mml:mrow><mml:mrow><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mi mathvariant="normal">j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:munderover><mml:mrow><mml:msubsup><mml:mtext>y</mml:mtext><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow><mml:mrow><mml:mi>img</mml:mi><mml:mo>/</mml:mo><mml:mi>graph</mml:mi></mml:mrow></mml:msubsup><mml:msubsup><mml:mtext>logP</mml:mtext><mml:mrow><mml:mtext>j</mml:mtext></mml:mrow><mml:mrow><mml:mi>img</mml:mi><mml:mo>/</mml:mo><mml:mi>graph</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
</sec>
</sec>
</sec>
<sec id="S3">
<title>Experiment</title>
<p>Both the main experiment and the control experiments were carried out in an Ubuntu 18.04 environment with an Intel Core i9-9820X processor, 64 GB of memory, and a GeForce RTX 2080Ti graphics card (11 GB GDDR6). The deep learning framework PyTorch, in combination with CUDA 9.0, was used for training. In both the main and the control experiments, the batch sizes of the training set and the validation set were set to 16 and 8, respectively, based on the number of network parameters. The number of iterations of all network models was set to 50. The learning rate for model training was set to 0.0001, and Adam was used as the optimizer. In addition, all models in the image branch adopted their corresponding network structures, and the number of classes in the final output layer was modified to 6. To ensure a fair performance comparison, none of the models used pre-trained weights.</p>
<sec id="S3.SS1">
<title>Evaluation Indicators</title>
<p>The models were compared from four aspects: recognition accuracy, recognition precision, sensitivity, and specificity. See Eqs 8&#x2013;11 for the corresponding formulas.</p>
<disp-formula id="S3.E8"><label>(8)</label><mml:math id="M8"><mml:mrow><mml:mtext>Accuracy</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext></mml:mrow><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FN</mml:mtext></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mn>100</mml:mn><mml:mo>%</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<disp-formula id="S3.E9"><label>(9)</label><mml:math id="M9"><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mfrac><mml:mtext>TP</mml:mtext><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FP</mml:mtext></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mn>100</mml:mn><mml:mo>%</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<disp-formula id="S3.E10"><label>(10)</label><mml:math id="M10"><mml:mrow><mml:mtext>Sensitivity</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mfrac><mml:mtext>TP</mml:mtext><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FN</mml:mtext></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mn>100</mml:mn><mml:mo>%</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<disp-formula id="S3.E11"><label>(11)</label><mml:math id="M11"><mml:mrow><mml:mtext>Specificity</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mfrac><mml:mtext>TN</mml:mtext><mml:mrow><mml:mtext>FP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mn>100</mml:mn><mml:mo>%</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where TP refers to the number of samples that belong to category C and were correctly classified by the classifier; FP refers to the number of samples that do not belong to category C but were misclassified by the classifier as category C; TN refers to the number of samples that do not belong to category C and were correctly classified by the classifier; and FN refers to the number of samples that belong to category C but were misclassified by the classifier.</p>
</sec>
<sec id="S3.SS2">
<title>Comparison of Models for the Image Branch</title>
<p>In the separate training process of the convolutional neural network, the selected control networks were AlexNet, ResNet18, DenseNet169, MobileNet, and VGG19. The training process without feature decomposition and recombination of the original image is shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, and the training process with feature decomposition and recombination is shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. In order to validate whether the feature decomposition and recombination method could improve the robustness of the model, two image datasets (i.e., one dataset with the same disease severity and one dataset with different disease severities) were used respectively for testing. The comparisons of testing results on the two datasets are shown in <xref ref-type="table" rid="T2">Tables 2</xref>, <xref ref-type="table" rid="T3">3</xref> respectively.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>The training process using original images.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g005.tif"/>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption><p>The training process with feature decomposition and recombination.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g006.tif"/>
</fig>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Test results on the dataset with the same disease severity.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td/>
<td valign="top" align="center" colspan="4">Original images</td>
<td valign="top" align="center" colspan="4">Images with feature decomposition and recombination</td>
</tr>
<tr>
<td/>
<td valign="top" colspan="4"><hr/></td>
<td valign="top" colspan="4"><hr/></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">Acc/%</td>
<td valign="top" align="center">Pre/%</td>
<td valign="top" align="center">Sen/%</td>
<td valign="top" align="center">Spe/%</td>
<td valign="top" align="center">Acc/%</td>
<td valign="top" align="center">Pre/%</td>
<td valign="top" align="center">Sen/%</td>
<td valign="top" align="center">Spe/%</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">AlexNet</td>
<td valign="top" align="center">93.77</td>
<td valign="top" align="center">81.25</td>
<td valign="top" align="center">81.12</td>
<td valign="top" align="center">96.28</td>
<td valign="top" align="center">90.84</td>
<td valign="top" align="center">73.61</td>
<td valign="top" align="center">71.79</td>
<td valign="top" align="center">94.55</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DenseNet169</bold></td>
<td valign="top" align="center"><bold>95.05</bold></td>
<td valign="top" align="center"><bold>86.54</bold></td>
<td valign="top" align="center"><bold>85.03</bold></td>
<td valign="top" align="center"><bold>97.05</bold></td>
<td valign="top" align="center">94.14</td>
<td valign="top" align="center">83.27</td>
<td valign="top" align="center">82.22</td>
<td valign="top" align="center">96.52</td>
</tr>
<tr>
<td valign="top" align="left">MobileNet</td>
<td valign="top" align="center">91.03</td>
<td valign="top" align="center">74.31</td>
<td valign="top" align="center">72.39</td>
<td valign="top" align="center">94.55</td>
<td valign="top" align="center">92.49</td>
<td valign="top" align="center">76.79</td>
<td valign="top" align="center">76.52</td>
<td valign="top" align="center">95.49</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ResNet18</bold></td>
<td valign="top" align="center">94.14</td>
<td valign="top" align="center">83.72</td>
<td valign="top" align="center">81.77</td>
<td valign="top" align="center">96.46</td>
<td valign="top" align="center"><bold>95.42</bold></td>
<td valign="top" align="center"><bold>87.50</bold></td>
<td valign="top" align="center"><bold>85.63</bold></td>
<td valign="top" align="center"><bold>97.27</bold></td>
</tr>
<tr>
<td valign="top" align="left">VGG19</td>
<td valign="top" align="center">89.74</td>
<td valign="top" align="center">67.93</td>
<td valign="top" align="center">68.24</td>
<td valign="top" align="center">93.87</td>
<td valign="top" align="center">89.56</td>
<td valign="top" align="center">67.70</td>
<td valign="top" align="center">67.28</td>
<td valign="top" align="center">93.71</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p><italic>Bold represents that the comprehensive performance metrics of the model or scheme is the best.</italic></p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Test results on the dataset with different disease severities.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td/>
<td valign="top" align="center" colspan="4">Original images</td>
<td valign="top" align="center" colspan="4">Images with feature decomposition and recombination</td>
</tr>
<tr>
<td/>
<td valign="top" colspan="4"><hr/></td>
<td valign="top" colspan="4"><hr/></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">Acc/%</td>
<td valign="top" align="center">Pre/%</td>
<td valign="top" align="center">Sen/%</td>
<td valign="top" align="center">Spe/%</td>
<td valign="top" align="center">Acc/%</td>
<td valign="top" align="center">Pre/%</td>
<td valign="top" align="center">Sen/%</td>
<td valign="top" align="center">Spe/%</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">AlexNet</td>
<td valign="top" align="center">87.07</td>
<td valign="top" align="center">65.68</td>
<td valign="top" align="center">66.44</td>
<td valign="top" align="center">92.21</td>
<td valign="top" align="center">87.87</td>
<td valign="top" align="center">68.66</td>
<td valign="top" align="center">67.93</td>
<td valign="top" align="center">92.64</td>
</tr>
<tr>
<td valign="top" align="left">DenseNet169</td>
<td valign="top" align="center">87.55</td>
<td valign="top" align="center">64.32</td>
<td valign="top" align="center">65.98</td>
<td valign="top" align="center">92.58</td>
<td valign="top" align="center">85.70</td>
<td valign="top" align="center">60.31</td>
<td valign="top" align="center">64.50</td>
<td valign="top" align="center">91.34</td>
</tr>
<tr>
<td valign="top" align="left">MobileNet</td>
<td valign="top" align="center">86.10</td>
<td valign="top" align="center">55.82</td>
<td valign="top" align="center">60.83</td>
<td valign="top" align="center">91.78</td>
<td valign="top" align="center">86.51</td>
<td valign="top" align="center">61.41</td>
<td valign="top" align="center">62.41</td>
<td valign="top" align="center">91.92</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ResNet18</bold></td>
<td valign="top" align="center"><bold>88.51</bold></td>
<td valign="top" align="center"><bold>65.33</bold></td>
<td valign="top" align="center"><bold>68.13</bold></td>
<td valign="top" align="center"><bold>93.11</bold></td>
<td valign="top" align="center"><bold>89.32</bold></td>
<td valign="top" align="center"><bold>71.71</bold></td>
<td valign="top" align="center"><bold>70.42</bold></td>
<td valign="top" align="center"><bold>93.55</bold></td>
</tr>
<tr>
<td valign="top" align="left">VGG19</td>
<td valign="top" align="center">82.41</td>
<td valign="top" align="center">54.56</td>
<td valign="top" align="center">52.16</td>
<td valign="top" align="center">89.41</td>
<td valign="top" align="center">82.89</td>
<td valign="top" align="center">52.94</td>
<td valign="top" align="center">52.48</td>
<td valign="top" align="center">89.78</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p><italic>Bold represents that the comprehensive performance metrics of the model or scheme is the best.</italic></p></fn>
</table-wrap-foot>
</table-wrap>
<p>During the training process, the images became more complex after feature decomposition and recombination, and depending on the segmentation granularity, the same disease spot might be segmented into different blocks. Thus, the overall training burden was increased. Consequently, it can be seen from <xref ref-type="fig" rid="F5">Figures 5</xref>, <xref ref-type="fig" rid="F6">6</xref> that the model trained on original images achieved higher accuracy and lower loss. However, according to the test results on the two different datasets, although the model trained on original images performed better on the dataset with the same disease severity, most of the models trained with feature decomposition and recombination achieved better outcomes on the dataset with different disease severities. This proves that the feature decomposition and recombination method can contribute to the robustness of the model.</p>
<p>In the image branch, the feature decomposition and recombination method was used for model training. In order to identify the impact of different segmentation granularities on the recognition results, ResNet18 was chosen as the basic feature extraction network, and the granularity was set to 2, 4, 8, and the superimposition of the three (i.e., 2 + 4 + 8). The two datasets as mentioned earlier were used for testing, and the test results are shown in <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap position="float" id="T4">
<label>TABLE 4</label>
<caption><p>Comparison of different segmentation granularities.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Split granularity</td>
<td valign="top" align="center" colspan="4">Test datasets with the same disease severity<hr/></td>
<td valign="top" align="center" colspan="4">Test datasets with different disease severities<hr/></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">Acc/%</td>
<td valign="top" align="center">Pre/%</td>
<td valign="top" align="center">Sen/%</td>
<td valign="top" align="center">Spe/%</td>
<td valign="top" align="center">Acc/%</td>
<td valign="top" align="center">Pre/%</td>
<td valign="top" align="center">Sen/%</td>
<td valign="top" align="center">Spe/%</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="center">95.05</td>
<td valign="top" align="center">86.86</td>
<td valign="top" align="center">84.61</td>
<td valign="top" align="center">97.05</td>
<td valign="top" align="center">84.98</td>
<td valign="top" align="center">56.54</td>
<td valign="top" align="center">60.86</td>
<td valign="top" align="center">91.04</td>
</tr>
<tr>
<td valign="top" align="left">4</td>
<td valign="top" align="center">94.14</td>
<td valign="top" align="center">82.94</td>
<td valign="top" align="center">82.05</td>
<td valign="top" align="center">96.53</td>
<td valign="top" align="center">85.86</td>
<td valign="top" align="center">61.44</td>
<td valign="top" align="center">62.47</td>
<td valign="top" align="center">91.63</td>
</tr>
<tr>
<td valign="top" align="left">8</td>
<td valign="top" align="center">93.04</td>
<td valign="top" align="center">81.94</td>
<td valign="top" align="center">79.16</td>
<td valign="top" align="center">95.87</td>
<td valign="top" align="center">83.69</td>
<td valign="top" align="center">55.72</td>
<td valign="top" align="center">56.25</td>
<td valign="top" align="center">90.48</td>
</tr>
<tr>
<td valign="top" align="left"><bold>2 + 4 + 8</bold></td>
<td valign="top" align="center"><bold>95.42</bold></td>
<td valign="top" align="center"><bold>87.50</bold></td>
<td valign="top" align="center"><bold>85.63</bold></td>
<td valign="top" align="center"><bold>97.27</bold></td>
<td valign="top" align="center"><bold>89.32</bold></td>
<td valign="top" align="center"><bold>71.71</bold></td>
<td valign="top" align="center"><bold>70.42</bold></td>
<td valign="top" align="center"><bold>93.55</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p><italic>Bold represents that the comprehensive performance metrics of the model or scheme is the best.</italic></p></fn>
</table-wrap-foot>
</table-wrap>
<p>It can be seen from <xref ref-type="table" rid="T4">Table 4</xref> that, among the single granularities, granularity 2 achieved the best outcome on the test set with the same disease severity, whereas granularity 4 achieved the best outcome on the test set with different disease severities. With feature decomposition and recombination, the complexity of the image increased as the segmentation granularity increased. As a result, large diseased areas might be split and randomly scattered to arbitrary positions in the image, which increased the training difficulty when using the dataset with the same disease severity. On the dataset with different disease severities, however, the recognition outcomes were different: as long as the segmentation granularity was reasonable, the performance of the model improved as the segmentation granularity increased. Moreover, with the superimposition of different granularities (2 + 4 + 8), the model achieved a better outcome than with the best single granularity. In order to observe the regions of interest of the model on the disease image, Grad-CAM++ (<xref ref-type="bibr" rid="B1">Chattopadhay et al., 2018</xref>) was used to visualize the model, and the results are shown in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption><p>Visualization of the model&#x2019;s region of interest. <bold>(A)</bold> Original image. <bold>(B)</bold> Granularity 2. <bold>(C)</bold> Granularity 4. <bold>(D)</bold> Granularity 8. <bold>(E)</bold> Granularity 2 + 4 + 8.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g007.tif"/>
</fig>
<p>It can be seen from <xref ref-type="fig" rid="F7">Figure 7</xref> that the models trained at all granularities could accurately recognize the diseased area. Models trained at granularity 4 and 8 had similar regions of interest, while models trained at granularity 2 and 2 + 4 + 8 had similar regions of interest. The model trained at a larger granularity was more sensitive to larger diseased areas, whereas the model trained at a smaller granularity was more sensitive to smaller diseased areas. Furthermore, with the superimposition of different granularities, the model was more likely to be affected by the small-granularity segmentation model. By comprehensively considering the recognition accuracy of different models on the test set and the model&#x2019;s regions of interest, it was found that the models trained at different granularities had their respective advantages. Therefore, in this study, the feature segmentation for the image branch integrated different granularities in order to achieve accurate acquisition and learning on the diseased area.</p>
</sec>
<sec id="S3.SS3">
<title>Comparison of Models for the Graph Structure Branch</title>
<p>The GCN takes the features of the current node itself and the relationship between the current node and its neighbors as the network training parameters. The features of the current node are always updated based on the features of the previous node. Thus, the number of layers of the graph neural network (i.e., the number of hops in the neighbors of the node) is very important to the final outcome of the model. In this section, the number of layers of the GCN was set to 1, 2, and 3, respectively. The training process is shown in <xref ref-type="fig" rid="F8">Figure 8</xref>, and the test results of the test set are shown in <xref ref-type="table" rid="T5">Table 5</xref>.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption><p>The GCN training process for different layers.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g008.tif"/>
</fig>
<table-wrap position="float" id="T5">
<label>TABLE 5</label>
<caption><p>The GCN test results for different layers.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td/>
<td valign="top" align="center">Accuracy/%</td>
<td valign="top" align="center">Precision/%</td>
<td valign="top" align="center">Sensitivity/%</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">GCN_1layer</td>
<td valign="top" align="center">45.60</td>
<td valign="top" align="center">45.38</td>
<td valign="top" align="center">45.62</td>
</tr>
<tr>
<td valign="top" align="left">GCN_2layer</td>
<td valign="top" align="center">82.42</td>
<td valign="top" align="center">82.83</td>
<td valign="top" align="center">82.47</td>
</tr>
<tr>
<td valign="top" align="left">GCN_3layer</td>
<td valign="top" align="center">43.96</td>
<td valign="top" align="center">42.46</td>
<td valign="top" align="center">43.22</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>It can be seen from <xref ref-type="fig" rid="F8">Figure 8</xref> and <xref ref-type="table" rid="T5">Table 5</xref> that, with the same number of training iterations, the two-layer GCN model was significantly superior to the one-layer and three-layer GCN models in terms of accuracy, precision, sensitivity, and loss. This is because, in the graph structure dataset based on disease knowledge, most of the disease information is directly related to the disease category, and its adjacency matrix therefore has a larger weight. In addition, <xref ref-type="bibr" rid="B12">Kipf and Welling (2016)</xref> and <xref ref-type="bibr" rid="B14">Li et al. (2018)</xref> also reported that a GCN that was too deep would lead to the problems of gradient disappearance or gradient explosion, while a GCN that was too shallow would lead to poor performance due to fewer learning features. Therefore, in this study, the two-layer GCN was chosen for knowledge supplementation and fusion with the convolutional neural network.</p>
<p>In order to demonstrate the effectiveness of the graph structure text in the process of disease recognition and to further explain the basis of judgments, the diseases were classified in this section according to word nodes and text nodes, and the five word nodes with the highest recognition confidence were then extracted for each category. The words with the highest confidence among different categories are summarized in <xref ref-type="table" rid="T6">Table 6</xref>. The text nodes were clustered according to the final node representation of the model, and the clustering method was t-distributed Stochastic Neighbor Embedding (t-SNE). The clustering results are shown in <xref ref-type="fig" rid="F9">Figure 9</xref>.</p>
<table-wrap position="float" id="T6">
<label>TABLE 6</label>
<caption><p>Disease feature words.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Tomato powdery mildew</td>
<td valign="top" align="center">Tomato early blight</td>
<td valign="top" align="center">Cucumber powdery mildew</td>
<td valign="top" align="center">Cucumber virus disease</td>
<td valign="top" align="center">Cucumber downy mildew</td>
<td valign="top" align="center">Cucumber bacterial spot</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Powdery</td>
<td valign="top" align="center">Blobs</td>
<td valign="top" align="center">Protrusions</td>
<td valign="top" align="center">Watery</td>
<td valign="top" align="center">Polygonal</td>
<td valign="top" align="center">Gray</td>
</tr>
<tr>
<td valign="top" align="left">White</td>
<td valign="top" align="center">Tomato</td>
<td valign="top" align="center">Raised</td>
<td valign="top" align="center">Beans</td>
<td valign="top" align="center">Square</td>
<td valign="top" align="center">Radiating</td>
</tr>
<tr>
<td valign="top" align="left">Spotty</td>
<td valign="top" align="center">Obverse</td>
<td valign="top" align="center">Inward</td>
<td valign="top" align="center">Petiole</td>
<td valign="top" align="center">Regular</td>
<td valign="top" align="center">Hair</td>
</tr>
<tr>
<td valign="top" align="left">Melatonin</td>
<td valign="top" align="center">Brown</td>
<td valign="top" align="center">Pattern</td>
<td valign="top" align="center">Sized</td>
<td valign="top" align="center">Dark</td>
<td valign="top" align="center">Evenly</td>
</tr>
<tr>
<td valign="top" align="left">Middle</td>
<td valign="top" align="center">Ring</td>
<td valign="top" align="center">Shrink</td>
<td valign="top" align="center">Sporadic</td>
<td valign="top" align="center">Rectangular</td>
<td valign="top" align="center">Sides</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption><p>Clustering results of text nodes. <bold>(A)</bold> One-layer GCN. <bold>(B)</bold> Two-layer GCN.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-12-731688-g009.tif"/>
</fig>
<p>It can be seen from <xref ref-type="table" rid="T6">Table 6</xref> that most of the five feature words with the highest correlation to each disease category that were obtained by the GCN could correctly represent the features of the corresponding disease category, but there were also some non-feature words with weak correlation. In general, however, the graph structure text could provide knowledge information for the disease image recognition process, and guide the model training to a certain extent. <xref ref-type="fig" rid="F9">Figure 9</xref> compares the effects of the one-layer GCN and the two-layer GCN in text node clustering. It can be seen that the clustering effect of the two-layer GCN was significantly superior to that of the one-layer GCN. This is consistent with the recognition outcomes shown in <xref ref-type="table" rid="T5">Table 5</xref>.</p>
</sec>
<sec id="S3.SS4">
<title>Comparison of Fusion Models</title>
<p>In the image branch, the feature decomposition and recombination method was applied in this study to improve the robustness of the model, but the actual application effect differed for different choices of the basic network structure. On the test set with the same disease severity, DenseNet169 achieved the best recognition accuracy, but on the test set with different disease severities, ResNet18 achieved the best performance. Moreover, in both test sets, the performance of ResNet18 was improved after applying the feature decomposition and recombination method. Therefore, in this study, ResNet18 was chosen as the convolutional neural network. In the graph structure branch, as the two-layer GCN appeared to be more suitable than the one-layer and three-layer networks for the graph structure based on disease knowledge, the two-layer GCN was used. The test results of the fusion model are shown in <xref ref-type="table" rid="T7">Table 7</xref>.</p>
<table-wrap position="float" id="T7">
<label>TABLE 7</label>
<caption><p>Test results of the fusion model.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td/>
<td valign="top" align="center">Accuracy/%</td>
<td valign="top" align="center">Precision/%</td>
<td valign="top" align="center">Sensitivity/%</td>
<td valign="top" align="center">Specificity/%</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Tomato powdery mildew</td>
<td valign="top" align="center">97.25</td>
<td valign="top" align="center">86.67</td>
<td valign="top" align="center">99.34</td>
<td valign="top" align="center">96.30</td>
</tr>
<tr>
<td valign="top" align="left">Tomato early blight</td>
<td valign="top" align="center">100</td>
<td valign="top" align="center">100</td>
<td valign="top" align="center">100</td>
<td valign="top" align="center">100</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber powdery mildew</td>
<td valign="top" align="center">95.60</td>
<td valign="top" align="center">91.67</td>
<td valign="top" align="center">96.58</td>
<td valign="top" align="center">86.84</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber virus disease</td>
<td valign="top" align="center">98.35</td>
<td valign="top" align="center">96.55</td>
<td valign="top" align="center">98.69</td>
<td valign="top" align="center">93.33</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber downy mildew</td>
<td valign="top" align="center">96.70</td>
<td valign="top" align="center">94.44</td>
<td valign="top" align="center">97.26</td>
<td valign="top" align="center">89.47</td>
</tr>
<tr>
<td valign="top" align="left">Cucumber bacterial spot</td>
<td valign="top" align="center">97.80</td>
<td valign="top" align="center">87.50</td>
<td valign="top" align="center">99.37</td>
<td valign="top" align="center">95.45</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Average</bold></td>
<td valign="top" align="center"><bold>97.62</bold></td>
<td valign="top" align="center"><bold>92.81</bold></td>
<td valign="top" align="center"><bold>98.54</bold></td>
<td valign="top" align="center"><bold>93.57</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p><italic>The bold terms and values represent the average values of the performance metrics of model recognition on the data of different disease categories.</italic></p></fn>
</table-wrap-foot>
</table-wrap>
<p>According to <xref ref-type="table" rid="T2">Tables 2</xref>, <xref ref-type="table" rid="T5">5</xref>, <xref ref-type="table" rid="T7">7</xref>, the accuracy, precision, sensitivity and specificity of the fusion model were improved to varying degrees for all disease categories. In terms of accuracy, the fusion model was improved by about 3% compared with the original ResNet18 model, and by about 15% compared with the two-layer graph neural network. Therefore, it can be concluded that the fusion model not only learned the visual features in the image, but also made corrections on the recognition results according to the non-image features in the graph structure. As a result, it achieved the best outcome.</p>
</sec>
</sec>
<sec id="S4" sec-type="discussion">
<title>Discussion</title>
<p>To address the weak robustness of conventional feature extraction networks to datasets with different disease severities, a feature decomposition and recombination method was proposed in this study, which improved the robustness of the original feature extraction network to a certain extent. However, the effect of this method differed for different network structures. Generally speaking, it could derive an ideal model on the dataset with the same disease severity. On the dataset with different disease severities, its performance remained at a high level, although it showed a certain degree of decrease. Therefore, future research should consider how to improve the robustness of feature extraction models with different structures. Given that conventional disease recognition methods do not make use of other expressions of disease visual factors, this study proposed to use a GCN to train on the text description information of visual disease features, and a graph structure disease dataset was built. However, this dataset is static. With the continuous increase of disease information, new datasets will need to be built in the future. Thus, follow-up research should consider using a dynamic graph neural network training method for optimization.</p>
</sec>
<sec id="S5" sec-type="conclusion">
<title>Conclusion</title>
<p>Conventional disease recognition methods lack the use of modal information other than the image modality. In the present study, the disease text description information represented by continuous vectors was decomposed and recombined into graph structure data. For image data, the feature decomposition was implemented by randomly disarranging and recombining the image blocks after segmentation, which improved the robustness of the model to a certain extent. Specifically, the accuracy, precision, sensitivity and specificity of the fusion model were 97.62, 92.81, 98.54, and 93.57%, respectively. This research provides new ideas for disease recognition, and puts forward new insights and methodology in improving the robustness of disease recognition models.</p>
</sec>
<sec id="S6" sec-type="data-availability">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="S7">
<title>Author Contributions</title>
<p>CW: writing original draft. JZ: software and validation. YZ: data curation. HW: methodology, and writing &#x2013; review and editing. CZ: writing &#x2013; review and editing, and supervision. GT: investigation. JL: visualization. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="conf1" sec-type="COI-statement">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="pudiscl1" sec-type="disclaimer">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<sec id="S8" sec-type="funding-information">
<title>Funding</title>
<p>This work was supported in part by the National Key Research and Development Program of China under Grant 2020YFD1100602, in part by the Hebei Province Key Research and Development Program under Grants 20327402D and 19227210D, in part by the National Natural Science Foundation of China under Grant 61871041, in part by the Project of Introducing Overseas Students in Hebei Province under Grant C20190340, and in part by the Research Project of Basic Scientific Research Business Expenses of Provincial Colleges and Universities in Hebei Province under Grant KY202004.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chattopadhay</surname> <given-names>A.</given-names></name> <name><surname>Sarkar</surname> <given-names>A.</given-names></name> <name><surname>Howlader</surname> <given-names>P.</given-names></name> <name><surname>Balasubramanian</surname> <given-names>V.</given-names></name></person-group> (<year>2018</year>). &#x201C;<article-title>Grad-cam++: Generalized gradient-based visual explanations for deep convolutional networks</article-title>,&#x201D; in <source><italic>2018 IEEE Winter Conference on Applications of Computer Vision (WACV)</italic></source>, (<publisher-name>IEEE</publisher-name>), <fpage>839</fpage>&#x2013;<lpage>847</lpage>. <pub-id pub-id-type="doi">10.1109/WACV.2018.00097</pub-id></citation></ref>
<ref id="B2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name> <name><surname>Nanehkarana</surname> <given-names>Y. A.</given-names></name></person-group> (<year>2020</year>). <article-title>Using deep transfer learning for image-based plant disease identification.</article-title> <source><italic>Comp. Electr. Agricult.</italic></source> <volume>173</volume>:<issue>105393</issue>. <pub-id pub-id-type="doi">10.1016/j.compag.2020.105393</pub-id></citation></ref>
<ref id="B3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z. M.</given-names></name> <name><surname>Wei</surname> <given-names>X. S.</given-names></name> <name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Guo</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). &#x201C;<article-title>Multi-label image recognition with graph convolutional networks</article-title>,&#x201D; in <source><italic>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</italic></source>, (<publisher-name>IEEE</publisher-name>), <fpage>5177</fpage>&#x2013;<lpage>5186</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2019.00532</pub-id></citation></ref>
<ref id="B4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Eitrich</surname> <given-names>T.</given-names></name> <name><surname>Kless</surname> <given-names>A.</given-names></name> <name><surname>Druska</surname> <given-names>C.</given-names></name> <name><surname>Grotendorst</surname> <given-names>J.</given-names></name></person-group> (<year>2007</year>). <article-title>Classification of Highly Unbalanced CYP450 Data of Drugs Using Cost Sensitive Machine Learning Techniques.</article-title> <source><italic>Cheminform</italic></source> <volume>38</volume> <fpage>92</fpage>&#x2013;<lpage>103</lpage>. <pub-id pub-id-type="doi">10.1021/ci6002619</pub-id> <pub-id pub-id-type="pmid">17238253</pub-id></citation></ref>
<ref id="B5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ferentinos</surname> <given-names>K. P.</given-names></name></person-group> (<year>2018</year>). <article-title>Deep learning models for plant disease detection and diagnosis.</article-title> <source><italic>Comp. Electr. Agricult.</italic></source> <volume>145</volume> <fpage>311</fpage>&#x2013;<lpage>318</lpage>. <pub-id pub-id-type="doi">10.1016/j.compag.2018.01.009</pub-id></citation></ref>
<ref id="B6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fuentes</surname> <given-names>A.</given-names></name> <name><surname>Yoon</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Park</surname> <given-names>D. S.</given-names></name></person-group> (<year>2017</year>). <article-title>A Robust Deep-Learning-Based Detector for Real-Time Tomato Plant Diseases and Pests Recognition.</article-title> <source><italic>Sensors</italic></source> <volume>17</volume>:<issue>2022</issue>. <pub-id pub-id-type="doi">10.3390/s17092022</pub-id> <pub-id pub-id-type="pmid">28869539</pub-id></citation></ref>
<ref id="B7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Geetharamani</surname> <given-names>G.</given-names></name> <name><surname>Pandian</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>Identification of plant leaf diseases using a nine-layer deep convolutional neural network.</article-title> <source><italic>Comp. Elect. Eng.</italic></source> <volume>76</volume> <fpage>323</fpage>&#x2013;<lpage>338</lpage>. <pub-id pub-id-type="doi">10.1016/j.compeleceng.2019.04.011</pub-id></citation></ref>
<ref id="B8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Peng</surname> <given-names>Y.</given-names></name></person-group> (<year>2017</year>). &#x201C;<article-title>Fine-grained image classification via combining vision and language</article-title>,&#x201D; in <source><italic>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</italic></source>, (<publisher-name>IEEE</publisher-name>).<volume>Vol. 2017</volume> <fpage>5994</fpage>&#x2013;<lpage>6002</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2017.775</pub-id></citation></ref>
<ref id="B9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Peng</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>Fine-grained visual-textual representation learning.</article-title> <source><italic>IEEE Trans. Circ. Syst. Video Technol.</italic></source> <volume>30</volume> <fpage>520</fpage>&#x2013;<lpage>531</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2019.2892802</pub-id></citation></ref>
<ref id="B10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Van Der Maaten</surname> <given-names>L.</given-names></name> <name><surname>Weinberger</surname> <given-names>K. Q.</given-names></name></person-group> (<year>2016</year>). &#x201C;<article-title>Densely Connected Convolutional Networks</article-title>,&#x201D; in <source><italic>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</italic></source>, (<publisher-name>IEEE</publisher-name>), <fpage>4700</fpage>&#x2013;<lpage>4708</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2017.243</pub-id></citation></ref>
<ref id="B11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jie</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Gang</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>G.</given-names></name> <name><surname>Wu</surname> <given-names>E.</given-names></name></person-group> (<year>2017</year>). &#x201C;<article-title>Squeeze-and-Excitation Networks</article-title>,&#x201D; in <source><italic>IEEE Transactions on Pattern Analysis and Machine Intelligence</italic></source>, (<publisher-name>IEEE</publisher-name>), <fpage>99</fpage>.</citation></ref>
<ref id="B12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kipf</surname> <given-names>T. N.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2016</year>). <article-title>Semi-Supervised Classification with Graph Convolutional Networks.</article-title> <source><italic>arXiv</italic></source> <volume>1609</volume>:<issue>02907</issue>.</citation></ref>
<ref id="B13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J. H.</given-names></name> <name><surname>Lin</surname> <given-names>L. J.</given-names></name> <name><surname>Tian</surname> <given-names>K.</given-names></name> <name><surname>Alaa</surname> <given-names>A. A.</given-names></name></person-group> (<year>2020</year>). <article-title>Detection of leaf diseases of balsam pear in the field based on improved Faster R-CNN.</article-title> <source><italic>Transac. Chin. Soc. Agricult. Eng.</italic></source> <volume>36</volume> <fpage>179</fpage>&#x2013;<lpage>185</lpage>.</citation></ref>
<ref id="B14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Han</surname> <given-names>Z.</given-names></name> <name><surname>Wu</surname> <given-names>X. M.</given-names></name></person-group> (<year>2018</year>). &#x201C;<article-title>Deeper insights into graph convolutional networks for semi-supervised learning</article-title>,&#x201D; in <source><italic>Proceedings of the AAAI Conference on Artificial Intelligence</italic></source>, <volume>Vol. 32</volume> (<publisher-loc>Palo Alto</publisher-loc>: <publisher-name>AAAI Press</publisher-name>), <fpage>1</fpage>.</citation></ref>
<ref id="B15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Guo</surname> <given-names>R.</given-names></name></person-group> (<year>2020</year>). <article-title>A solanaceae disease recognition model based on SE-Inception.</article-title> <source><italic>Comp. Electr. Agricult.</italic></source> <volume>178</volume>:<issue>105792</issue>. <pub-id pub-id-type="doi">10.1016/j.compag.2020.105792</pub-id></citation></ref>
<ref id="B16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Menardi</surname> <given-names>G.</given-names></name> <name><surname>Torelli</surname> <given-names>N.</given-names></name></person-group> (<year>2012</year>). <article-title>Training and assessing classification rules with unbalanced data.</article-title> <source><italic>Data Min. Knowl. Discov.</italic></source> <volume>28</volume> <fpage>92</fpage>&#x2013;<lpage>122</lpage>. <pub-id pub-id-type="doi">10.1007/s10618-012-0295-5</pub-id></citation></ref>
<ref id="B17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mohanty</surname> <given-names>S. P.</given-names></name> <name><surname>Hughes</surname> <given-names>D. P.</given-names></name> <name><surname>Salathe</surname> <given-names>M.</given-names></name></person-group> (<year>2016</year>). <article-title>Using Deep Learning for Image-Based Plant Disease Detection.</article-title> <source><italic>Front. Plant Sci.</italic></source> <volume>7</volume>:<fpage>1419</fpage>&#x2013;<lpage>1419</lpage>. <pub-id pub-id-type="doi">10.3389/fpls.2016.01419</pub-id> <pub-id pub-id-type="pmid">27713752</pub-id></citation></ref>
<ref id="B18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reed</surname> <given-names>S.</given-names></name> <name><surname>Akata</surname> <given-names>Z.</given-names></name> <name><surname>Lee</surname> <given-names>H.</given-names></name> <name><surname>Schiele</surname> <given-names>B.</given-names></name></person-group> (<year>2016</year>). &#x201C;<article-title>Learning Deep Representations of Fine-Grained Visual Descriptions</article-title>,&#x201D; in <source><italic>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</italic></source>, (<publisher-name>IEEE</publisher-name>). <pub-id pub-id-type="doi">10.1109/CVPR.2016.13</pub-id></citation></ref>
<ref id="B19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Szegedy</surname> <given-names>C.</given-names></name> <name><surname>Vanhoucke</surname> <given-names>V.</given-names></name> <name><surname>Ioffe</surname> <given-names>S.</given-names></name> <name><surname>Shlens</surname> <given-names>J.</given-names></name> <name><surname>Wojna</surname> <given-names>Z.</given-names></name></person-group> (<year>2016</year>). &#x201C;<article-title>Rethinking the Inception Architecture for Computer Vision</article-title>,&#x201D; in <source><italic>IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</italic></source>, (<publisher-name>IEEE</publisher-name>), <fpage>2818</fpage>&#x2013;<lpage>2826</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.308</pub-id></citation></ref>
<ref id="B20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Too</surname> <given-names>E. C.</given-names></name> <name><surname>Yujian</surname> <given-names>L.</given-names></name> <name><surname>Njuki</surname> <given-names>S.</given-names></name> <name><surname>Yingchun</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>A comparative study of fine-tuning deep learning models for plant disease identification.</article-title> <source><italic>Comp. Electr. Agricult.</italic></source> <volume>161</volume> <fpage>272</fpage>&#x2013;<lpage>279</lpage>. <pub-id pub-id-type="doi">10.1016/j.compag.2018.03.032</pub-id></citation></ref>
<ref id="B21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Zhou</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>Few-shot vegetable disease recognition model based on image text collaborative representation learning.</article-title> <source><italic>Comp. Electr. Agricult.</italic></source> <volume>184</volume>:<issue>106098</issue>. <pub-id pub-id-type="doi">10.1016/j.compag.2021.106098</pub-id></citation></ref>
<ref id="B22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>G.</given-names></name> <name><surname>Yu</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Automatic Image-Based Plant Disease Severity Estimation Using Deep Learning.</article-title> <source><italic>Comput. Intellig. Neurosci.</italic></source> <volume>2017</volume>:<issue>2917536</issue>. <pub-id pub-id-type="doi">10.1155/2017/2917536</pub-id> <pub-id pub-id-type="pmid">28757863</pub-id></citation></ref>
<ref id="B23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>S.</given-names></name> <name><surname>Feng</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>J. H.</given-names></name></person-group> (<year>2020</year>). <article-title>Identification Method for Potato Disease Based on Deep Learning and Composite Dictionary.</article-title> <source><italic>Transact. Chin. Soc. Agricult. Mach.</italic></source> <volume>7</volume> <fpage>22</fpage>&#x2013;<lpage>29</lpage>.</citation></ref>
<ref id="B24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>L.</given-names></name> <name><surname>Mao</surname> <given-names>C.</given-names></name> <name><surname>Luo</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). &#x201C;<article-title>Graph convolutional networks for text classification</article-title>,&#x201D; in <source><italic>Proceedings of the AAAI Conference on Artificial Intelligence</italic></source>, (<publisher-loc>Palo Alto</publisher-loc>: <publisher-name>AAAI Press</publisher-name>). <volume>33</volume> <fpage>7370</fpage>&#x2013;<lpage>7377</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v33i01.33017370</pub-id></citation></ref>
<ref id="B25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhong</surname> <given-names>Y.</given-names></name> <name><surname>Zhao</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>Research on deep learning in apple leaf disease recognition.</article-title> <source><italic>Comp. Electr. Agricult.</italic></source> <volume>168</volume>:<issue>105146</issue>. <pub-id pub-id-type="doi">10.1016/j.compag.2019.105146</pub-id></citation></ref>
</ref-list>
</back>
</article>
