<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Physiol.</journal-id>
<journal-title>Frontiers in Physiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Physiol.</abbrev-journal-title>
<issn pub-type="epub">1664-042X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">847267</article-id>
<article-id pub-id-type="doi">10.3389/fphys.2022.847267</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Weakly Supervised Deep Learning for Tooth-Marked Tongue Recognition</article-title>
<alt-title alt-title-type="left-running-head">Zhou et al.</alt-title>
<alt-title alt-title-type="right-running-head">Weakly Supervised Tooth-Marked Tongue Recognition</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhou</surname>
<given-names>Jianguo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Shangxuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Xuesong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Zizhu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hou</surname>
<given-names>Xinyuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1697789/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lai</surname>
<given-names>Wei</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Shifeng</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1359349/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Deng</surname>
<given-names>Qingqiong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhou</surname>
<given-names>Wu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1330404/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Medical Information Engineering</institution>, <institution>Guangzhou University of Chinese Medicine</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Artificial Intelligence</institution>, <institution>Beijing Normal University</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Beijing Yikang Medical Technology Co., Ltd.</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/59713/overview">Eun Bo Shim</ext-link>, Kangwon National University, South Korea</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1662942/overview">Hee-Jeong Jin</ext-link>, Korea Institute of Oriental Medicine (KIOM), South Korea</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1674697/overview">Bob Zhang</ext-link>, University of Macau, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Qingqiong Deng, <email>qqdeng@bnu.edu.cn</email>; Wu Zhou, <email>zhouwu@gzucm.edu.cn</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this work and share first authorship</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Computational Physiology and Medicine, a section of the journal Frontiers in Physiology</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>04</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>847267</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>01</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>03</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Zhou, Li, Wang, Yang, Hou, Lai, Zhao, Deng and Zhou.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Zhou, Li, Wang, Yang, Hou, Lai, Zhao, Deng and Zhou</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The recognition of tooth-marked tongues has important value for clinical diagnosis of traditional Chinese medicine. Tooth-marked tongue is often related to spleen deficiency, cold dampness, sputum, effusion, and blood stasis. The clinical manifestations of patients with tooth-marked tongue include loss of appetite, borborygmus, gastric distention, and loose stool. Traditional clinical tooth-marked tongue recognition is conducted subjectively based on the doctor&#x2019;s visual observation, and its performance is affected by the doctor&#x2019;s subjectivity, experience, and environmental lighting changes. In addition, the tooth marks typically have various shapes and colors on the tongue, which make it very challenging for doctors to identify tooth marks. The existing methods based on deep learning have made great progress for tooth-marked tongue recognition, but there are still shortcomings such as requiring a large amount of manual labeling of tooth marks, inability to detect and locate the tooth marks, and not conducive to clinical diagnosis and interpretation. In this study, we propose an end-to-end deep neural network for tooth-marked tongue recognition based on weakly supervised learning. Note that the deep neural network only requires image-level annotations of tooth-marked or non-tooth marked tongues. In this method, a deep neural network is trained to classify tooth-marked tongues with the image-level annotations. Then, a weakly supervised tooth-mark detection network (WSTDN) as an architecture variant of the pre-trained deep neural network is proposed for the tooth-marked region detection. Finally, the WSTDN is re-trained and fine-tuned using only the image-level annotations to simultaneously realize the classification of the tooth-marked tongue and the positioning of the tooth-marked region. 
Experimental results of clinical tongue images demonstrate the superiority of the proposed method compared with previously reported deep learning methods for tooth-marked tongue recognition. The proposed tooth-marked tongue recognition model may provide important syndrome diagnosis and efficacy evaluation methods, and contribute to the understanding of ethnopharmacological mechanisms.</p>
</abstract>
<kwd-group>
<kwd>traditional Chinese medicine</kwd>
<kwd>tooth-marked tongue</kwd>
<kwd>deep learning</kwd>
<kwd>weakly supervised learning</kwd>
<kwd>tongue diagnosis</kwd>
<kwd>convolutional neural network</kwd>
</kwd-group>
<contract-num rid="cn001">2017YFC1700106</contract-num>
<contract-sponsor id="cn001">National Key Research and Development Program of China<named-content content-type="fundref-id">10.13039/501100012166</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Tongue diagnosis is one of the most important diagnostic methods in Chinese medicine. The characteristics of a tongue, such as shape and color, can reflect the internal health of the body, and the severity or progression of the disease. By observing the characteristics of a tongue, Chinese medicine can distinguish the clinical symptoms and choose appropriate treatment strategies (<xref ref-type="bibr" rid="B21">Zhang et al., 2017</xref>). As one of the most important tongue features, tooth marks are generally formed by the compression of the fatter tongue by adjacent teeth. <xref ref-type="fig" rid="F1">Figure 1</xref> shows some representative tooth-marked and non-tooth marked tongue images. Typically, tooth-marked tongue refers to a kind of abnormal tongue shape in which the tongue body is fat in different degrees and is compressed by the teeth, and the edge of the tongue body is formed with tooth marks that are serrated. Non-tooth-marked tongue tends to be moderately fat and thin, and the edges of the tongue are continuous and smooth. According to the theory of traditional Chinese medicine, tooth-marked tongue is often related to spleen deficiency, cold dampness, sputum, effusion, and blood stasis. The clinical manifestations of patients with tooth-marked tongue include loss of appetite, borborygmus, gastric distention, and loose stool (<xref ref-type="bibr" rid="B10">Li and Dong, 2017</xref>). Hence, the recognition of tooth-marked tongues has important value for clinical diagnosis of Chinese medicine. However, the routine clinical recognition of tooth-marked tongues is through the doctor&#x2019;s visual observation, and its performance is limited by the doctor&#x2019;s subjectivity and experience, in addition to environmental lighting changes. In addition, there are different types of tooth marks, including different colors and varied shapes, which make it challenging for doctors to identify. 
Therefore, the study of objective tooth-marked tongue recognition based on image data has important clinical value.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Representative tongue images. <bold>(A)</bold> Tooth-marked tongue with a very obvious contour distortion along both sides of the tongue, accompanied by the color change of the extruded area of the tongue; <bold>(B)</bold> Non-tooth marked tongue. The tongue body is flat, the peripheral contour is regular, and there is no contour distortion and color change area; <bold>(C)</bold> Suspicious tooth-marked tongue. The identification is controversial because the tongue body is flat and the peripheral contour is not distorted. Finally, the tooth-marked tongue is determined by the color change of the extruded area of the tongue edge.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g001.tif"/>
</fig>
<p>In recent years, researchers have been trying to establish an objective tooth-marked tongue recognition model based on digital image processing and analysis. Most studies are based on the local color and unevenness of the tooth scar area. <xref ref-type="bibr" rid="B8">Hsu et al. (2010)</xref> analyzed the RGB color composition based on the image of the tongue region and found that the G chromatogram of the tooth marks is lower than that of the tongue body and tongue surface. <xref ref-type="bibr" rid="B12">Lo et al. (2012)</xref> found that different imaging angles or the degree of tongue extrusion would affect the judgment of tooth marks. <xref ref-type="bibr" rid="B11">Li et al. (2019)</xref> used concavity information to generate suspicious tooth-marked areas for the following classification of the tooth-marked tongue. However, due to the great color difference of tongues and varied shape of tooth marks, the recognition based on color and shape of tooth marks usually has low robustness and stability.</p>
<p>With the continuous development of artificial intelligence and deep learning, the convolutional neural network (CNN) model is applied to tongue analysis. <xref ref-type="bibr" rid="B25">Xu et al. (2020)</xref> used multi-task learning with deep learning to realize tongue segmentation and tongue coating classification, and achieved better results than single task. <xref ref-type="bibr" rid="B26">Jiang et al. (2021)</xref> proposed to assess the tongue image quality based on a deep CNN. <xref ref-type="bibr" rid="B27">Hu et al. (2021)</xref> proposed a method for automatic construction of Chinese herbal prescriptions from tongue images using CNN and auxiliary latent therapy topics. Typically, current identification of the tooth-marked tongue generally extracts the entire tongue image, and then directly classifies the tooth-marked tongue based on CNN (<xref ref-type="bibr" rid="B15">Sun et al., 2019</xref>; <xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>). Specifically, <xref ref-type="bibr" rid="B15">Sun et al. (2019)</xref> proposed a 7-layer CNN model that takes the tongue image as the input to identify the tooth-marked tongue, with an accuracy of 78.6%. <xref ref-type="bibr" rid="B19">Wang et al. (2020)</xref> classified the tooth-marked tongue based on a deeper CNN network, and showed that their method could achieve promising results of the tooth-marked tongue recognition. However, since the center, tip, and base of the tongue are not informative for the identification of the tooth-marked tongue, incorporating these non-informative areas into the deep neural network for analysis may have a negative impact on the performance of the model (<xref ref-type="bibr" rid="B2">Chong Wang et al., 2015</xref>). 
In addition, such tooth-marked tongue classification only provides the image-level identification of the tooth-marked tongues, and does not provide the specific location of the tooth marks, which is not conducive to assisting clinical diagnosis and interpretability.</p>
<p>Since tooth marks are one of the symptoms of a tongue, the identification of tooth-marked tongues has been regarded as a fine-grained classification problem, and the classification of tooth-marked tongue can be conducted by multiple instance learning (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>). Specifically, multiple candidates of tooth-marked areas are generated on the tongue body. If all candidates are non-tooth marks, it is a non-tooth marked tongue. On the contrary, if one candidate on the tongue is a tooth mark, it is considered as a tooth-marked tongue. <xref ref-type="bibr" rid="B11">Li et al. (2019)</xref> first used concavity information to generate candidates of tooth-marked areas, followed by extracting CNN deep features from these areas, and finally classified the features based on a multi-instance support vector machine (MiSVM) to classify the tooth-marked tongue. Although such pioneer work has obtained promising results for the tooth-marked tongue recognition, it has several shortcomings. First, it uses concavity information to generate candidate areas of the tongue, non-tooth marked tongues rarely have such concavity information, which makes it difficult to achieve a unified generation of candidate examples of tooth-marked and non-tooth marked tongues. Then, it requires a large amount of tooth mark examples for feature extraction, which will bring a lot of labor costs. Furthermore, deep features are extracted through the CNN model for each candidate tooth-marked region, which will also bring a lot of computational cost. Finally, this process is not an end-to-end deep neural network and cannot provide a discriminative location of the tooth-marked area, which is not conducive to assisting clinical diagnosis and interpretability. Recently, <xref ref-type="bibr" rid="B16">Tang et al. (2020)</xref> proposed a tongue region detection and tongue landmark detection <italic>via</italic> deep learning for tooth-marked tongue recognition. 
However, it requires a lot of image annotation including tongue landmark annotation and tongue region annotation, which is a huge burden and tedious work for clinic. <xref ref-type="bibr" rid="B28">Weng et al. (2021)</xref> proposed a weakly supervised tooth-mark detection method using the YOLO object detection model. However, it requires fully bounding-box level annotation of tooth marks in addition to coarse image-level annotation of tooth-marked tongue images. These are very tedious work for clinical application.</p>
<p>In this study, a weakly supervised object detection using deep learning is proposed for the tooth-marked tongue recognition, where only image-level labels are used for model training. The proposed method is motivated by the work of weakly supervised deep detection network in computer vision (<xref ref-type="bibr" rid="B1">Bilen and Vedaldi, 2016</xref>), in which a CNN pre-trained for image classification on a large dataset ImageNet is modified to reason efficiently about regions, branching off a recognition, and a detection data streams. The resulting architecture can be fine-tuned on a target dataset to achieve state-of-the-art weakly supervised object detection using only image-level annotations. Based on this consideration, the tooth-marked tongue recognition can be naturally solved from the perspective of weakly supervised object detection for two reasons. First, the detection and localization of tooth-marked areas are conducive to assisting clinical diagnosis and interpretability. Second, only image-level annotation without tooth marks labeling can significantly reduce the cost of data annotation. In addition, Zhou et al. proposed that even when there is no supervision of the target position, the convolution unit of the convolution layer can be regarded as the target detector (<xref ref-type="bibr" rid="B23">Zhou et al., 2015</xref>). Therefore, the classification network of tooth-marked tongues with only image-level annotations makes it possible to locate tooth marks without providing tooth mark annotations.</p>
<p>To this end, we propose an end-to-end deep neural network for tooth-marked tongue recognition based on weakly supervised learning. To improve the efficiency and reliability of the generation of the candidate tooth-marked areas, we use the prior knowledge of the tooth marks distribution to generate the candidate tooth-marked areas through the position information. In addition, to avoid the labeling of a large number of examples of tooth marks for deep learning, we propose a weakly supervised learning method for tooth-marked tongue recognition. Specifically, we first train a deep neural network model to classify tooth-marked tongues with the image-level annotations, and then we propose a weakly supervised tooth-mark detection network (WSTDN) as an architecture variant of the pre-trained deep neural network for the tooth-marked region detection, followed by fine-tuning the WSTDN once again using only the image-level annotations to simultaneously realize the classification of the tooth-marked tongue and the positioning of the tooth-marked region.</p>
<p>Compared to the existing works, the main contributions of the present work are summarized as follows: 1) we propose an end-to-end deep neural network for tooth-marked tongue recognition based on weakly supervised learning, avoiding manual labeling, and screening of a large number of tooth-marked examples; 2) we propose a novel method for generating candidate regions based on prior knowledge of tooth mark distribution to improve the efficiency of tongue tooth-marked candidate region generation; 3) in the case of only image-level labels, we propose the WSTDN to realize the classification of the tooth-marked tongue and the positioning of the tooth-marked area at the same time, which is convenient for assisting clinical diagnosis and interpretation.</p>
</sec>
<sec id="s2">
<title>2 Materials and Methods</title>
<sec id="s2-1">
<title>2.1 Clinical Data</title>
<p>The study was approved by the local ethics committee, and the patient signed the informed consent form (IRB:2019BZHYLL0101). We used standard equipment designed by Shanghai Daoshi Medical Technology Co., Ltd. (DS01-B) to obtain tongue images from patients in the local institute. Then, we transferred the images to a workstation for clinical evaluation. Three Traditional Chinese Medicine (TCM) physicians with two to five years of clinical experience distinguished tongue images into tooth-marked tongue or non-tooth marked tongue. All professionals are well-trained and have normal vision. The TCM clinical criteria for diagnosing tooth-marked tongues are as follows: First, observe whether there are jagged tooth marks caused by teeth pressing on the tongue on both sides of the tongue; secondly, for the tongue with inconspicuous jagged tooth marks, observe the color depth of the suspected area, in which the compressed tooth scar area typically has a darker color (<xref ref-type="bibr" rid="B16">Tang et al., 2020</xref>). The detailed evaluation procedure of this study consists of three steps. First, professionals discussed and acknowledged the diagnostic criteria for tooth-marked tongue. Secondly, a professional classified all 330 tongue images for identifying tooth-marked tongues, and two other professionals reviewed the classification results separately. In the case of disagreement, three professionals would discuss and make a final decision. By dividing the number of inconsistent samples reviewed by experts by the total number of samples, the inconsistency rate among experts is 14.8%. The main reasons for the inconsistency are the shadows caused by the influence of the light and the inconspicuous tooth marks on the tongue. However, in the second judgment after the discussion, opinions often reach an agreement. The samples on which experts made inconsistent judgments are regarded as difficult samples. 
Removing these difficult samples from the training data reduces the generalization performance of the model because it only recognizes samples with obvious tooth marks. The data set after clinical screening contains 130 tooth-marked tongue images and 200 non-tooth marked images. It should be noted that in this study, the clinic only needs to provide image-level labels for the tongue image samples as tooth-marked or non-tooth marked tongues, and there is no need to provide the specific location and bounding boxes annotations of tooth-marks on the tongue. <xref ref-type="fig" rid="F2">Figure 2</xref> shows an overall pipeline for the proposed method.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overview of the construction of the dataset and the main processing procedures of the proposed method. <bold>(A)</bold> Tongue images were captured with standard equipment. <bold>(B)</bold> Classification of tooth-marked and non-tooth-marked samples to construct the original tongue image dataset. <bold>(C)</bold> Tongue region was delineated to construct a tongue image dataset. <bold>(D)</bold> Pre-trained CNN model, fine-tuned WSTDN model with image-level labeled data, and image-level results output. <bold>(E)</bold> Performance with validation metrics.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g002.tif"/>
</fig>
</sec>
<sec id="s2-2">
<title>2.2 Data Preprocessing</title>
<p>First, we used the Labelme software (<ext-link ext-link-type="uri" xlink:href="http://labelme.csail.mit.edu/Release3.0/">http://labelme.csail.mit.edu/Release3.0/</ext-link>) to outline the tongue area, and then performed the AND operation on this area with the original image to extract the image of the entire tongue area. The purpose of extracting the tongue region was to shield the irrelevant face and the interference of the surrounding background of the tongue, so as not to affect the recognition performance of the model. The delineated tongue images were resized to 224 &#xd7; 224 before entering the network, which were used in training the deep neural network. We adopted a data augmentation method of random horizontal inversion, random rotation of 0&#x2013;15&#xb0;, and random vertical inversion for the tongue image in order to obtain more training data for training the deep neural network.</p>
</sec>
<sec id="s2-3">
<title>2.3 The Proposed Framework</title>
<p>The proposed method mainly includes three stages, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. First, in the pre-trained CNN module, we pre-train a CNN model with the weight initialization of ImageNet using image-level annotations to distinguish between tooth-marked and non-tooth marked tongues. Subsequently, we propose the WSTDN that uses the pre-trained CNN model as the backbone and add the spatial region proposal (SRP), spatial pyramid pool (SPP), Classification module, and Detection module to achieve the weakly supervised tooth-marked tongue recognition. Finally, we fine-tune the WSTDN with only the image-level annotations, simultaneously realizing the classification of the tooth-marked tongue and the positioning of the tooth-mark area. Each module will be introduced in the following subsections.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Framework of the proposed method. It includes a pre-trained CNN module; an SRP module for generating tooth mark candidate regions; an SPP-layer for obtaining and normalizing the deep features of tooth mark candidate regions; two weakly supervised branches, a classification module and a detection module; The hadamard operation of the branch results and summation yields the image-level classification results.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g003.tif"/>
</fig>
<sec id="s2-3-1">
<title>2.3.1 Pre-Trained Network</title>
<p>Our study is based on the premise that pre-trained CNN can be well generalized to a large number of tasks, as there is evidence that CNNs trained for image classification can bring proxies to object detection (<xref ref-type="bibr" rid="B23">Zhou et al., 2015</xref>). It is worth noting that these concepts are obtained implicitly without providing the network with information about the location of these structures in the image. Correspondingly, the CNN trained in tongue image classification may already implicitly contain most of the information needed to perform tooth-marked area detection. Therefore, we propose to train a CNN with the training data of tongue images and only image-level supervision (no bounding box annotations) for further tooth-marks detection. Note that the CNN has been pre-trained on ImageNet ILSVRC 2012 data (<xref ref-type="bibr" rid="B14">Russakovsky et al., 2015</xref>). In this study, we use the ResNet34 network (<xref ref-type="bibr" rid="B6">He et al., 2016</xref>), which is consistent with the previous study that has proved that ResNet34 is superior to other typical CNN models in tongue image classification (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>).</p>
<p>As shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, the structure of ResNet34 is shown in the green dotted box, in which the two convolution layers are a group, and the residual calculation is conducted in the shortcut connection block as shown in the red dotted box. The size of tongue images and deep features are expressed as (batch size, width, height, and channel). The solid line residual arrow indicates that the input and output have the same dimension, and the dotted line residual arrow indicates that the input and output have different dimensions. Solid line &#x2295; is calculated as <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the dotted line &#x2295; is calculated as <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where f(x) represents the feature calculated by the yellow matrix, and W is the convolution operation, which is used to adjust the channel dimension of X. ReLU (<xref ref-type="bibr" rid="B13">Nair and Hinton, 2010</xref>) was used as the activation function. The pre-trained ResNet34 model will be used as the backbone to build the proposed WSTDN for the weakly supervised tooth-marked tongue recognition in the following section.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>SPP layer. The green cuboid represents the deep feature of the tooth mark candidate area, and R represents the number of tooth mark candidate areas. It transforms the (h, w, 512) features of the R tooth mark candidate regions into a unified (R, 4096) size, h and w separately represent the height and width of the tooth mark candidate region.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g004.tif"/>
</fig>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Weakly Supervised Tooth-Marked Tongue Detection Network</title>
<p>In order to achieve the objective of weakly supervised tooth-marked tongue recognition, we have made certain improvements based on the pre-trained ResNet34 model. First, we removed the avgpool layer and fc layer behind the last BN layer in ResNet34 (that is, the classifier layer, which is only used for feature extraction), and replaced it with a spatial pyramid pool (SPP) (<xref ref-type="bibr" rid="B7">He et al., 2014</xref>). We implemented SPP as a network layer (<xref ref-type="bibr" rid="B5">Girshick, 2015</xref>) to allow the system to be trained end-to-end and improve efficiency. By introducing SPP as the network layer, we only need the original tongue image to pass through the CNN network, and we can get a deep feature of (batchsize, 7, 7, 512). As shown in the following <xref ref-type="fig" rid="F5">Figure 5</xref> of the SPP layer, the candidate area is mapped to find the corresponding candidate feature area on the 7 &#xd7; 7 feature map. If the size of the candidate area is 32 &#xd7; 32, from the tongue image to the deep feature, a candidate area takes at least 1 &#xd7; 1 grid feature and at most 2 &#xd7; 2 grid features. The SPP network layer stretches candidate feature regions of different sizes to the same size, and then inputs them to the fully connected layer, so that feature maps are calculated first, and the results of the feature maps can be shared when each candidate region is represented, saving a lot of calculation time. At this time, in the network structure, the regional-level features are further processed by two fully connected layers, and each layer contains a linear map and an activation function ReLU. Inspired by the previous research on the weakly supervised detection network (<xref ref-type="bibr" rid="B1">Bilen and Vedaldi, 2016</xref>), we branched out from the output of the last layer of the SPP layer into two modules, a classification module and a detection module.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Architecture of the ResNet34 model. The size of tongue images and deep features are expressed as (batch size, width, height and channel). The solid line residual arrow indicates that the input and output have the same dimension, and the dotted line residual arrow indicates that the input and output have different dimensions.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g005.tif"/>
</fig>
</sec>
<sec id="s2-3-3">
<title>2.3.3 Candidate Region Generation Method Based on Location Information (Spatial Region Proposal, SRP)</title>
<p>The selection of tooth-marked candidate area is of great significance in tooth-marked tongue recognition. In order to generate candidate regions to use with our proposed network, we propose a novel method to select candidate areas with simple equidistant frames on both sides of the tongue. The proposed method comes from the doctor&#x2019;s clinical observation. When the doctor judges whether it is a tooth-marked tongue, the main focus is on the areas of both sides of the tongue. This method avoids the large-area overlap of the candidate areas, and it is also simple and efficient to unify the selection of the candidate area of the tooth-marked tongue and the non-tooth marked tongue.</p>
<p>
<xref ref-type="fig" rid="F6">Figure 6</xref> shows the process of the candidate region generation method based on location information. First, we convert the tongue image (<xref ref-type="fig" rid="F6">Figure 6A</xref>) into a grayscale image (<xref ref-type="fig" rid="F6">Figure 6B</xref>), because we only need position information, so we do not need to consider color information. Converting into a grayscale image can greatly reduce our candidate region generation time. On the left side of the tongue, we move downward from the top, traverse from left to middle, and get the first non-zero point, which is recorded as the midpoint. We save the minimum x, minimum y, width, and height of the candidate region according to the midpoint (<xref ref-type="fig" rid="F6">Figure 6C</xref>). Then, we continue to traverse downward at equal intervals. This interval is set to 2/3 of the size of the candidate area, and the tongue is generally curved. Such a curvature can also give our candidate frame a certain horizontal displacement (<xref ref-type="fig" rid="F6">Figure 6D</xref>). The right side of the tongue is from top to bottom, from right to middle, using the same method to select candidate regions. We removed the first and last candidate regions on the left and right sides (<xref ref-type="fig" rid="F6">Figures 6E,F</xref>), because these two candidate regions are generally the base and tip of the tongue, which are not in the range of the tooth-marked tongue detection area. Finally, we obtain the candidate tooth-marked area on the color tongue image (<xref ref-type="fig" rid="F6">Figure 6G</xref>).</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Candidate region generation method based on location information. The arrow indicates the traversal direction. Take the left as an example, from top to bottom, from left to right, and the right side is the same symmetrical operation. The red point is the first non-zero value point traversed, as the midpoint of the tooth mark candidate area, (x<sub>min</sub>, y<sub>min</sub>) is the upper left corner of the tooth mark candidate area, h and w represent the tooth mark candidate area respectively height and width.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g006.tif"/>
</fig>
</sec>
<sec id="s2-3-4">
<title>2.3.4 Classification Module</title>
<p>As shown in the Classification module in <xref ref-type="fig" rid="F3">Figure 3</xref>, in order to discriminate the tooth-marked category of each candidate area, we make a linear mapping to the classification branch, and the output of this mapping is the category number <italic>C</italic>. Its definition is as follows (<xref ref-type="disp-formula" rid="e1">Eq. 1</xref>):<disp-formula id="e1">
<mml:math id="m3">
<mml:mrow>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>C</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the predicted scores on all classes in a certain area. Specifically, we calculate the index sum of <inline-formula id="inf4">
<mml:math id="m5">
<mml:mi>C</mml:mi>
</mml:math>
</inline-formula> categories of the same region box, and then divide the current element by the value (in our case, the number of categories <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Conducting the corresponding softmax transformation on the data in each column of <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which is equivalent to calculating the probability of tooth marks or non-tooth marks in a certain area.</p>
</sec>
<sec id="s2-3-5">
<title>2.3.5 Detection Module</title>
<p>As shown in the Detection module in <xref ref-type="fig" rid="F3">Figure 3</xref>, in order to obtain the scores of a certain class in all candidate regions, we make a linear mapping to the detection branch, and the mapping output is also the number of classes <inline-formula id="inf7">
<mml:math id="m8">
<mml:mi>C</mml:mi>
</mml:math>
</inline-formula>. Its definition is as follows <xref ref-type="disp-formula" rid="e2">Eq. 2</xref>:<disp-formula id="e2">
<mml:math id="m9">
<mml:mrow>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>d</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mi>d</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>R</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mi>d</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf8">
<mml:math id="m10">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>d</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the predicted scores on all regions of a certain category. Specifically, for the same category, we calculate the score of the current element relative to different region boxes. Corresponding softmax transformation is performed on the data of each row of <inline-formula id="inf9">
<mml:math id="m11">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>d</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which is equivalent to the score probability of a certain class in all regions.</p>
</sec>
<sec id="s2-3-6">
<title>2.3.6 Image-Level Classification Score</title>
<p>Since there is no real tooth-marked area and instance-level category labels for supervision, in the two branches of our model, the classification module predicts the tooth-marked category in a certain area, and the detection module selects which areas are more likely to contain the tooth-marked area. Therefore, the final score for each area is obtained by taking the product of the two score matrices (Hadamard) <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:msubsup>
<mml:mtext>x</mml:mtext>
<mml:mrow>
<mml:mtext>cr</mml:mtext>
</mml:mrow>
<mml:mtext>R</mml:mtext>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mtext>&#x3c3;</mml:mtext>
<mml:mrow>
<mml:mtext>class</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mtext>x</mml:mtext>
<mml:mtext>c</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mtext>&#x3c3;</mml:mtext>
<mml:mrow>
<mml:mtext>det</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mtext>x</mml:mtext>
<mml:mtext>d</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The score <inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mi>R</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is summed to obtain the final classification score y<sub>c</sub>, which can be defined as follows <xref ref-type="disp-formula" rid="e3">Eq. 3</xref>:<disp-formula id="e3">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>R</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>It is worth noting that <inline-formula id="inf12">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the sum of the product of the elements of the softmax standardized value of the area &#x7c;R&#x7c;, so it is in the range of (0, 1). Finally, we use cross-entropy loss to calculate the loss of our predicted <inline-formula id="inf13">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the original image-level label.</p>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 Implementation</title>
<p>The proposed weakly supervised tooth-marked tongue detection model was implemented using &#x201c;PyTorch&#x201d; (<ext-link ext-link-type="uri" xlink:href="http://pytorch.org">pytorch.org</ext-link>) and the Adam algorithm was used to minimize the objective function. Data augmentation was implemented with torchvision and image augmentation (<ext-link ext-link-type="uri" xlink:href="http://github.com/aleju/imgaug">github.com/aleju/imgaug</ext-link>). We used an NVIDIA TITAN RTX graphics card with 24G memory. The initialization of the learning rate was set to 1e-4 and the weight decay was set to 1e-4, batchsize was set to 32. The performance metrics of the computer are as follows: CPU is Intel(R) Xeon(R) Gold 5118. RAM is 64.0&#xa0;GB. GPU is NVIDIA TITAN RTX. Since the computation time per image is too short and inconsistent, we selected 20 tongue images, in which there are 10 tooth-marked images and 10 non-tooth-marked images, and the total time for generating the tooth mark candidate area is calculated to obtain the time of each picture. To alleviate the problem of overfitting, if the validation accuracy did not increase for 10&#xa0;epochs, an early stop was used to stop the optimization and save the model weight. Basic implementation code of the work is available on GitHub, <ext-link ext-link-type="uri" xlink:href="https://github.com/Lsx0802/WSTMD">https://github.com/Lsx0802/WSTMD</ext-link>.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Experimental Results</title>
<sec id="s3-1">
<title>3.1 Experimental Settings, Evaluation Metrics, and Comparison Methods</title>
<p>We divided 330 tongue images into a training set and a validation set, using four-fold cross-validation repeated five times. The performance of the model is evaluated by calculating the average value and variance of the evaluation metrics. The experimental results are evaluated by the following four metrics: (<xref ref-type="disp-formula" rid="e4">Eq. 4</xref>) Accuracy, (<xref ref-type="disp-formula" rid="e5">Eq. 5</xref>) Precision, (<xref ref-type="disp-formula" rid="e6">Eq. 6</xref>) Recall, (<xref ref-type="disp-formula" rid="e7">Eq. 7</xref>) F1 score.<disp-formula id="e4">
<mml:math id="m17">
<mml:mrow>
<mml:mtext>Accuracy</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
<disp-formula id="e5">
<mml:math id="m18">
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<disp-formula id="e6">
<mml:math id="m19">
<mml:mrow>
<mml:mtext>Recall</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m20">
<mml:mrow>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>&#xa0;Score</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where TP, FP, TN, FN represent true positive, false positive, true negative, and false negative, respectively. Accuracy is the proportion of the sum of positive and negative cases correctly classified to all samples. Precision is the proportion of positive cases correctly classified to all positive cases predicted by the model. Recall is the proportion of positive cases correctly predicted by the model to all positive samples. F1 score is the balance index used to measure the accuracy of the classification model, which takes into account both precision and recall of the model and can be regarded as a harmonic average of model precision and recall. When the cost of a false negative (FN) is very high (i.e., the consequences are very serious), FN should be reduced as much as possible, so as to improve the recall metric. Clinically, patients with tooth-marked tongues should be recognized for further treatment, so we want the model to have a higher recall value under similar accuracy conditions.</p>
<p>In addition, we output the candidate boxes whose scores of the model&#x2019;s candidate box &#x3c3;<sub>det</sub> are greater than (1/the number of candidate boxes) for visual observation. Finally, we compared the proposed method with the tooth-marked tongue recognition model using multi-instance SVM (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) and end-to-end convolutional network (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>). We also conducted different candidate region generation methods (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>; <xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) for comparison.</p>
</sec>
<sec id="s3-2">
<title>3.2 Performance Comparison of Different Methods</title>
<p>As tabulated in <xref ref-type="table" rid="T1">Table 1</xref>, the method of (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>) is directly based on the ResNet34 network and tongue images for classification, and the distinguishing accuracy can reach 72.28%. After initializing the image-based method with ImageNet pretrained weights (IPW), the accuracy can reach 79.40%. The method of (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) can achieve the accuracy of 90.34% by extracting instances and using ResNet34 to directly classify. Using ResNet34 to extract deep features followed by using MiSVM classification in the method of instance MiSVM (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) can further improve the performance, reaching 93.49%. Compared with the method of (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>), the proposed method and (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) have achieved a large performance improvement, which may be due to our further extraction of the tongue tooth-marked informative area.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Performance comparison of different methods for tooth-marked tongue recognition.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Backbone</th>
<th align="center">Methods</th>
<th align="center">Accuracy</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="5" align="left">Resnet34</td>
<td align="left">Image (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>)</td>
<td align="char" char="plusmn">0.7228 &#xb1; 0.0147</td>
<td align="char" char="plusmn">0.7344 &#xb1; 0.1025</td>
<td align="char" char="plusmn">0.5091 &#xb1; 0.1181</td>
<td align="char" char="plusmn">0.5863 &#xb1; 0.0678</td>
</tr>
<tr>
<td align="left">Image (IPW) (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>)</td>
<td align="char" char="plusmn">0.7940 &#xb1; 0.0442</td>
<td align="char" char="plusmn">0.8188 &#xb1; 0.8000</td>
<td align="char" char="plusmn">0.6275 &#xb1; 0.1574</td>
<td align="char" char="plusmn">0.6961 &#xb1; 0.1020</td>
</tr>
<tr>
<td align="left">Instance (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>)</td>
<td align="char" char="plusmn">0.9034 &#xb1; 0.0227</td>
<td align="char" char="plusmn">0.9185 &#xb1; 0.0344</td>
<td align="char" char="plusmn">0.8294 &#xb1; 0.3922</td>
<td align="char" char="plusmn">0.8711 &#xb1; 0.0298</td>
</tr>
<tr>
<td align="left">instance_MiSVM (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>)</td>
<td align="char" char="plusmn">0.9349 &#xb1; 0.0255</td>
<td align="char" char="plusmn">0.9332 &#xb1; 0.0349</td>
<td align="char" char="plusmn">0.9002 &#xb1; 0.0550</td>
<td align="char" char="plusmn">0.9156 &#xb1; 0.0340</td>
</tr>
<tr>
<td align="left">WSTDN</td>
<td align="char" char="plusmn">0.9197 &#xb1; 0.0759</td>
<td align="char" char="plusmn">0.8745 &#xb1; 0.1087</td>
<td align="char" char="plusmn">0.9427 &#xb1; 0.1197</td>
<td align="char" char="plusmn">0.9026 &#xb1; 0.0954</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>However, compared with (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) using MiSVM classification, our performance is not better than it. The reason may be that after the instance is generated, they have performed manual screening to achieve higher performance. On the other hand, compared to the softmax classifier in the proposed end-to-end network, the classification performance of SVM may be better than using softmax classification in the absence of training data (<xref ref-type="bibr" rid="B17">Tang, 2013</xref>; <xref ref-type="bibr" rid="B5">Girshick, 2015</xref>). However, (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) with SVM is not an end-to-end network, while the proposed method of optimizing the softmax classifier can simplify the training and test process (<xref ref-type="bibr" rid="B5">Girshick, 2015</xref>).</p>
</sec>
<sec id="s3-3">
<title>3.3 Performance Assessment of Candidate Region Proposal</title>
<p>For the method of generating candidate regions, we comparatively experimented with selective search (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>), edge boxes (<xref ref-type="bibr" rid="B24">Zitnick and Doll&#xe1;r, 2014</xref>), convex defect detection (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>), and our method SRP. The idea of edge boxes is that the more edge contours are completely contained in a box, the higher the probability that the box contains a target (<xref ref-type="bibr" rid="B24">Zitnick and Doll&#xe1;r, 2014</xref>). However, under the condition of tooth-marked tongue classification, it is difficult to frame the tooth-marked area with the method of (<xref ref-type="bibr" rid="B24">Zitnick and Doll&#xe1;r, 2014</xref>), because the tooth-marked area and the tongue are connected. The proposed method and (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) both use the prior knowledge of tooth marks. According to observations, the tooth-marked tongue does have convex defects on the edge of the tongue. Convexity detection can be used to frame the area, but the non-tooth marked tongue convexity area is not obvious, and it is difficult to achieve the unity of tooth-marked and non-tooth marked tongues.</p>
<p>
<xref ref-type="fig" rid="F7">Figure 7</xref> shows the comparison of candidate regions of different candidate region generation methods. The method (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>) in <xref ref-type="fig" rid="F7">Figure 7A</xref> can generate a large number of candidate frames, but there are many invalid candidate regions, and the candidate regions overlap seriously. The method (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) in <xref ref-type="fig" rid="F7">Figure 7B</xref> is very good in selecting the tooth-marked candidate areas, but the convex defect of the tongue tip will be detected, and the part of the tongue tip is not the informative area for the identification of the tooth mark, so it needs to be manually screened and removed later. Our method in <xref ref-type="fig" rid="F7">Figure 7C</xref> can efficiently select the informative area for the identification of tooth marks.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Generation of different candidate region proposal methods. <bold>(A)</bold> Selective search (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>), <bold>(B)</bold> convex defect detection (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>), <bold>(C)</bold> SRP (ours). Note that the yellow box is the candidate box of the tooth mark region generated by different methods.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g007.tif"/>
</fig>
<p>From the comparison of the results in <xref ref-type="table" rid="T2">Table 2</xref>, we can find that our SRP method is better than the method of (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>). It may be because our method selects the information area for identification of tooth marks, rather than invalid areas such as the tip of the tongue and the base of the tongue, and our method does not have the large-area overlap of the candidate frames in (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>). In addition, the time consumption of SRP is much lower than that of the method in (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>), probably because they use the color information of the three channels of RGB, while our method uses a gray-scale image, and the traversal method on the left and right sides reduces a lot of traversal time.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Performance comparison of different region proposal methods.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Backbone</th>
<th align="center">Methods</th>
<th align="center">Accuracy</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1-score</th>
<th align="center">Time per image</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="2" align="left">WSTMD</td>
<td align="left">Selective search (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>)</td>
<td align="char" char="plusmn">0.8708 &#xb1; 0.0979</td>
<td align="char" char="plusmn">0.8522 &#xb1; 0.1215</td>
<td align="char" char="plusmn">0.8068 &#xb1; 0.1495</td>
<td align="char" char="plusmn">0.8300 &#xb1; 0.1312</td>
<td align="char" char=".">0.30</td>
</tr>
<tr>
<td align="left">SRP (ours)</td>
<td align="char" char="plusmn">0.9197 &#xb1; 0.0759</td>
<td align="char" char="plusmn">0.8745 &#xb1; 0.1087</td>
<td align="char" char="plusmn">0.9427 &#xb1; 0.1197</td>
<td align="char" char="plusmn">0.9026 &#xb1; 0.0954</td>
<td align="char" char=".">0.19</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-4">
<title>3.4 Ablation Study</title>
<p>As shown in <xref ref-type="table" rid="T3">Table 3</xref>, IW represents using ImageNet weights to initialize the WSTDN model, and TL means using transfer learning to directly train a model for tongue image classification, and then initialize the WSTDN model with its weights. IW &#x2b; TL is the method we proposed, using ImageNet weights to initialize the ResNet34 model, followed by using the tongue images and image-level labels to train the ResNet34 model, and finally using its weights to initialize the WSTDN model. It can be observed that, when we use the IW method to initialize our tooth-marked detection model, the effect is not as good as using the TL method. The TL weight is learned from the tongue image and has a certain ability to discriminate the tongue image, so this transfer learning method achieves a better effect. The reason why it is inferior to the IW &#x2b; TL method may be that the weights are initialized randomly, and there is no good discrimination ability, which may lead to relatively lower performance.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Performance of ablation study in the proposed WSTDN method.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Backbone</th>
<th align="center">Methods</th>
<th align="center">Accuracy</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">WSTDN</td>
<td align="left">IW</td>
<td align="char" char="plusmn">0.7460 &#xb1; 0.0569</td>
<td align="char" char="plusmn">0.8403 &#xb1; 0.1120</td>
<td align="char" char="plusmn">0.4795 &#xb1; 0.2397</td>
<td align="char" char="plusmn">0.5661 &#xb1; 0.1795</td>
</tr>
<tr>
<td align="left">TL</td>
<td align="char" char="plusmn">0.8848 &#xb1; 0.0477</td>
<td align="char" char="plusmn">0.8777 &#xb1; 0.0610</td>
<td align="char" char="plusmn">0.8254 &#xb1; 0.0778</td>
<td align="char" char="plusmn">0.8506 &#xb1; 0.0607</td>
</tr>
<tr>
<td align="left">IW &#x2b; TL</td>
<td align="char" char="plusmn">0.9197 &#xb1; 0.0759</td>
<td align="char" char="plusmn">0.8745 &#xb1; 0.1087</td>
<td align="char" char="plusmn">0.9427 &#xb1; 0.1197</td>
<td align="char" char="plusmn">0.9026 &#xb1; 0.0954</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-5">
<title>3.5 Visualization</title>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> shows some examples of tooth-marked tongue recognition by the proposed method. As shown in <xref ref-type="fig" rid="F8">Figure 8</xref>(A1) and (C1), the edges on both sides of the tongue are flat without tooth marks and there are physiological defects at the root and tip of the tongues, which are not tooth marks. The proposed model avoids these physiological defect areas well, and the areas on both sides of the tongue are identified correctly. Furthermore, as shown in <xref ref-type="fig" rid="F8">Figure 8</xref>(B1), the tooth-marked tongue has distinctive characteristics, including tooth marks and color changes in the tooth pressure area. The proposed model can identify them correctly. As shown in <xref ref-type="fig" rid="F8">Figure 8</xref>(D1), some small color difference changes that are not easy to recognize or easily ignored by human eyes can be accurately identified by the model. The side indicated by the arrow in <xref ref-type="fig" rid="F8">Figure 8</xref>(E2) is more obvious than that indicated by the arrow in <xref ref-type="fig" rid="F8">Figure 8</xref>(E1), but the focus area of the model is single, and only one side identification area is concerned. Even if the typical tooth mark area indicated by the arrow in <xref ref-type="fig" rid="F8">Figure 8</xref>(E2) is not focused, the tooth-marked tongue is correctly identified by the identified area. In <xref ref-type="fig" rid="F8">Figure 8</xref>(F1), the model recognition is incorrect. The tongue image is squeezed on the edge of the tongue due to the tension of the tongue when the patient extends the tongue. The model mistakenly recognizes it as a tooth-marked tongue. Therefore, high-quality tongue body imaging is very important for tooth-marked tongue recognition.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Representative cases of tooth-marked tongue recognition by the proposed method. A2, B2, C2, D2, E2, and F2 are original tongue images, while A1, B1, C1, D1, E1 and F1 are corresponding prediction results of tooth marks with bounding boxes.</p>
</caption>
<graphic xlink:href="fphys-13-847267-g008.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<title>4 Discussion</title>
<p>The characteristics of a tongue can reflect the internal health of the body and the severity or progression of the disease in traditional Chinese medicine. Traditional Chinese medicine can distinguish the clinical symptoms and choose appropriate treatment strategies. As one of the most important tongue features, tooth-marked tongue has been used as an effective signature of health in traditional Chinese medicine. Our model may provide an important research paradigm for distinguishing tongue features, diagnosing syndromes of traditional Chinese medicine, tracking disease progression, and evaluating intervention effects, showing its unique potential in clinical applications. Potentially, the proposed method can also be used to evaluate the efficacy of the drug by detecting the tooth marks of the tongue for noninvasive ethnopharmacological evaluation (<xref ref-type="bibr" rid="B20">Wang et al., 2022</xref>). The pathological cause of tooth-marked tongue is the change of microcirculation of the tongue due to the compression of the tongue by the teeth. For example, there are blood supply disorders, local hypoxia, insufficient nutrition, tissue edema, etc. in the area of tooth compression, and eventually tooth marks are formed (<xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>). Previous studies have shown that tooth-marked tongue is closely related to human health and disease. The tooth-marked tongue is related to human gender and age, in which males have fewer tooth marks and women have more tooth marks, and the relationship between the increase of age and the reduction of tooth marks is more obvious (<xref ref-type="bibr" rid="B29">Hsu et al., 2019</xref>). In addition, there is a positive correlation between lung capacity and tooth-marked tongue. The occurrence rate of tooth-marked tongue is higher in patients with moderate or higher abdominal force. 
The occurrence of tooth-marked tongue in hypertensive patients without anemia is significantly related to the increase in hematocrit. Patients with hypoalbuminemia are mostly pale with tooth-marked tongue (<xref ref-type="bibr" rid="B30">Jing, 2002</xref>). The number of tongue features such as tooth marks, average coverage area, maximum coverage area, minimum coverage area, and organs corresponding to the coverage area can be used as criteria for evaluating chronic kidney disease or breast cancer (<xref ref-type="bibr" rid="B32">Lo et al., 2013</xref>; <xref ref-type="bibr" rid="B31">Chen et al., 2021</xref>). Patients with subacute eczema have a higher incidence of tooth marks than patients with acute eczema and patients with chronic eczema (<xref ref-type="bibr" rid="B33">Yu et al., 2017</xref>).</p>
<p>The proposed SRP module is more in line with the observation rules of traditional Chinese medicine physicians. First, according to the results of (<xref ref-type="bibr" rid="B15">Sun et al., 2019</xref>; <xref ref-type="bibr" rid="B19">Wang et al., 2020</xref>), tooth marks exist on both sides of the tongue, and the tip of the tongue and the center of the tongue are not the main discriminating areas. We use the method of equidistant selection on both sides of the tongue, which can efficiently extract the candidate regions of tooth marks. In contrast, the method of (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>) based on convexity area detection can extract the tooth-marked candidate area on the tooth-marked tongue. However, there is little obvious concave and convex information on the non-tooth marked tongue, which makes it difficult to generate the candidate regions of the tooth-marked and non-tooth marked tongues efficiently and uniformly. In addition, the method of (<xref ref-type="bibr" rid="B18">Uijlings et al., 2013</xref>) does not use the prior knowledge of tooth marks. The generated candidate areas have a large number of invalid frames and a lot of area overlap. It can be seen from <xref ref-type="table" rid="T2">Tables 2</xref>, <xref ref-type="table" rid="T3">3</xref> that our method has advantages in the generation time of candidate tooth-mark areas and model classification performance.</p>
<p>We initialize the CNN model based on ImageNet weights and use the transfer learning method to obtain better tooth-marked tongue detection results. Inspired by (<xref ref-type="bibr" rid="B22">Zhou et al., 2016</xref>), when they trained this scene classification convolutional network, the labels they gave were scene labels without any object calibration. The network neurons naturally evolved into object detectors. Therefore, we consider the tooth-marked tongue recognition as a tooth-marked area detection problem, rather than an instance-level classification problem. Unlike other detection methods such as (<xref ref-type="bibr" rid="B5">Girshick, 2015</xref>), we do not have instance-level labels. Our method is inspired by the weakly supervised deep target detection method (<xref ref-type="bibr" rid="B1">Bilen and Vedaldi, 2016</xref>), which uses image-level labels to classify and detect candidate regions. Based on the candidate regions, the image-level prediction results are obtained. By filtering the scores of our detection branches, we can better locate the tooth-marked area predicted by the model. By comparison, extracting candidate regions by filtering and labeling examples requires a lot of labeling costs in (<xref ref-type="bibr" rid="B11">Li et al., 2019</xref>).</p>
<p>Finally, this study still has certain shortcomings. First, the amount of data in this study is not large enough. The data comes from the same center, and the study of multi-center data has not been carried out, which will be conducted in the future. Secondly, there may be uncertainties in the gold standard label of clinical tooth-marked tongue by the TCM due to the challenging nature of tooth-mark recognition. Providing uncertainty estimates is not only important for safe decision-making in high-risk fields, but also crucial in fields where the data sources are highly inhomogeneous and labeled data is rare (<xref ref-type="bibr" rid="B3">Gal, 2016</xref>). Uncertainty research (<xref ref-type="bibr" rid="B9">Kendall and Gal, 2017</xref>; <xref ref-type="bibr" rid="B4">Gawlikowski et al., 2021</xref>) will be introduced in the follow-up. In addition, the proposed method performs discrimination based on the segmented tongue body, and the deviation of the tongue body segmentation may bring discriminant bias. The follow-up will consider the construction of multi-task learning for segmentation and detection to make the two tasks promote each other, thereby further improving the detection accuracy. Finally, the proposed method has not yet carried out prospective experiments. Since tooth-marked tongues are less common than non-tooth marked tongues in clinical practice, the sample distribution is uneven. This needs to be considered in follow-up models for prospective clinical experiments.</p>
</sec>
<sec id="s5">
<title>5 Conclusion</title>
<p>In this study, we proposed a weakly supervised learning method of tooth-marked tongue recognition, by pre-training a CNN model that classifies tooth-marked tongues, and then transferring it to the WSTDN with the utilization of only image-level labels (tooth-marked tongue/non-tooth marked tongue) for fine-tuning. Experimental results demonstrate that the proposed method with only image-level label annotations is effective, and its performance is comparable to that of the deep neural network method that requires a large number of instance labels. In addition, this method uses the CNN network for end-to-end training, and the tooth-marked tongue classification is achieved while the tooth-marked areas are located, which is convenient for clinical diagnosis and interpretation. This method is expected to play an important role in the clinical diagnosis of traditional Chinese medicine, especially in noninvasive ethnopharmacological evaluation.</p>
</sec>
</body>
<back>
<sec id="s6">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="s7">
<title>Ethics Statement</title>
<p>The studies involving human participants were reviewed and approved by the Guangzhou University of Chinese Medicine. The patients/participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s8">
<title>Author Contributions</title>
<p>JZ: Formal analysis, Methodology, Writing&#x2014;original draft, and Writing&#x2014;review and editing. SL: Formal analysis, Methodology, Writing&#x2014;original draft, and Writing&#x2014;review and editing. XW: Data curation, Writing&#x2014;review and editing. ZY: Data curation, Writing&#x2014;review and editing. XH: Data curation, Writing&#x2014;review and editing. WL: Data curation, Writing&#x2014;review and editing. SZ: Data curation, Writing&#x2014;review and editing. QD: Conceptualization, Writing&#x2014;review and editing. WZ: Conceptualization, Formal analysis, Funding acquisition, and Writing&#x2014;review and editing. All authors have read and approved the current version of the manuscript.</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>This work is supported by the National Key R&#x26;D Program of China (2017YFC1700106), the National Natural Science Foundation of China (82174224), and the Key Research Program of the Chinese Academy of Sciences (ZDRW-ZS-2021-1-2).</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of Interest</title>
<p>Author WL is employed by Beijing Yikang Medical Technology Co., Ltd.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bilen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Vedaldi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Weakly Supervised Deep Detection Networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name> (<publisher-loc>Oxford</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2846</fpage>&#x2013;<lpage>2854</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.311</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Chiu</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>F. M.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>P. C.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>C. C.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>The Tongue Features Associated with Chronic Kidney Disease</article-title>. <source>Medicine (Baltimore)</source> <volume>100</volume> (<issue>9</issue>), <fpage>e25037</fpage>. <pub-id pub-id-type="doi">10.1097/MD.0000000000025037</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chong Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kaiqi Huang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Weiqiang Ren</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Junge Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Maybank</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Large-scale Weakly Supervised Object Localization via Latent Category Learning</article-title>. <source>IEEE Trans. Image Process</source> <volume>24</volume> (<issue>4</issue>), <fpage>1371</fpage>&#x2013;<lpage>1385</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2015.2396361</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gal</surname>
<given-names>Yarin.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Uncertainty in Deep Learning</source>. </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gawlikowski</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Njieutcheu Tassi</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Humt</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> <article-title>A Survey of Uncertainty in Deep Neural Networks</article-title>. <comment>arXiv preprint arXiv:2107.03342</comment>, <year>2021</year>. </citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Fast R-Cnn</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name> (<publisher-name>Microsoft Research: IEEE</publisher-name>), <fpage>1440</fpage>&#x2013;<lpage>1448</lpage>. <pub-id pub-id-type="doi">10.1109/iccv.2015.169</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X. Y.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S. Q.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Deep Residual Learning for Image Recognition</article-title>. <source>IEEE Conf. Comput. Vis. Pattern Recognit</source>. <volume>2016</volume>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2016.90</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition</article-title>,&#x201d; in <conf-name>Proc. ECCV</conf-name> (<publisher-loc>Beijing</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>346</fpage>&#x2013;<lpage>361</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-10578-9_23</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hsu</surname>
<given-names>P. C.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H. K.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y. C.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>H. H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y. P.</given-names>
</name>
<name>
<surname>Chiang</surname>
<given-names>J. Y.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Gender- and Age-Dependent Tongue Features in a Community-Based Population</article-title>. <source>Medicine (Baltimore)</source> <volume>98</volume> (<issue>51</issue>), <fpage>e18350</fpage>. <pub-id pub-id-type="doi">10.1097/MD.0000000000018350</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hsu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lo</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chiang</surname>
<given-names>J. Y.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Automatic Tongue Feature Extraction</article-title>,&#x201d; in <conf-name>Proceedings of the Int. Comput. Symp.</conf-name>, <conf-loc>Tainan, Taiwan</conf-loc>, <conf-date>16-18 Dec. 2010</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>936</fpage>&#x2013;<lpage>941</lpage>. <pub-id pub-id-type="doi">10.1109/compsym.2010.5685377</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Automatic Construction of Chinese Herbal Prescriptions From Tongue Images Using CNNs and Auxiliary Latent Therapy Topics</article-title>. <source>IEEE Transact. Cybernet.</source> <volume>51</volume> (<issue>2</issue>), <fpage>708</fpage>&#x2013;<lpage>721</lpage>. <pub-id pub-id-type="doi">10.1109/TCYB.2019.2909925</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Tongue Image Quality Assessment Based on a Deep Convolutional Neural Network</article-title>. <source>BMC Med. Inform. Decision Making</source> <volume>21</volume>, <fpage>147</fpage>. <pub-id pub-id-type="doi">10.1186/s12911-021-01508-8</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jing</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>General Situation of Modern Research on Tooth-Marked Tongue (Review)</article-title>. <source>J. Beijing Univer. Tradit. Chin. Med.</source> (<issue>1</issue>), <fpage>57</fpage>&#x2013;<lpage>59</lpage>. </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kendall</surname>
<given-names>Alex.</given-names>
</name>
<name>
<surname>Gal</surname>
<given-names>Yarin.</given-names>
</name>
</person-group> <article-title>What Uncertainties Do We Need in Bayesian Deep Learning for Computer Vision?</article-title> <comment>arXiv preprint arXiv:1703.04977</comment>, <year>2017</year>. </citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). <source>Diagnostics of Traditional Chinese Medicine</source>. <publisher-loc>Beijing</publisher-loc>: <publisher-name>Science Press</publisher-name>. </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Tooth-Marked Tongue Recognition Using Multiple Instance Learning and CNN Features</article-title>. <source>IEEE Trans. Cybern.</source> <volume>49</volume> (<issue>2</issue>), <fpage>380</fpage>&#x2013;<lpage>387</lpage>. <pub-id pub-id-type="doi">10.1109/TCYB.2017.2772289</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lo</surname>
<given-names>L.-c.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.-F.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.-J.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>T.-L.</given-names>
</name>
<name>
<surname>Chiang</surname>
<given-names>J. Y.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>The Study on the Agreement between Automatic Tongue Diagnosis System and Traditional Chinese Medicine Practitioners</article-title>. <source>Evidence-Based Complement. Altern. Med.</source> <volume>2012</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1155/2012/505063</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lo</surname>
<given-names>L. C.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>T. L.</given-names>
</name>
<name>
<surname>Chiang</surname>
<given-names>J. Y.</given-names>
</name>
<name>
<surname>Damdinsuren</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Breast Cancer Index: A Perspective on Tongue Diagnosis in Traditional Chinese Medicine</article-title>. <source>J. Tradit. Complement. Med.</source> <volume>3</volume> (<issue>3</issue>), <fpage>194</fpage>&#x2013;<lpage>203</lpage>. <pub-id pub-id-type="doi">10.4103/2225-4110.114901</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nair</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G. E.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Rectified Linear Units Improve Restricted Boltzmann Machines</article-title>,&#x201d; in <conf-name>Proc. 27th Int. Conf. Mach. Learn. (ICML-10)</conf-name>, <conf-loc>Haifa, Isr.</conf-loc>, <conf-date>June 21-24, 2010</conf-date>. </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Russakovsky</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Krause</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Satheesh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Imagenet Large Scale Visual Recognition Challenge</article-title>. <source>Int. J. Comput. Vis.</source> <volume>115</volume>, <fpage>211</fpage>&#x2013;<lpage>252</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-015-0816-y</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Tooth-Marked Tongue Recognition Using Gradient-Weighted Class Activation Maps</article-title>. <source>Future Internet</source> <volume>11</volume>, <fpage>45</fpage>. <pub-id pub-id-type="doi">10.3390/fi11020045</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>An Automatic Recognition of Tooth- Marked Tongue Based on Tongue Region Detection and Tongue Landmark Detection via Deep Learning</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>153470</fpage>&#x2013;<lpage>153478</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3017725</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2013</year>). <source>Deep Learning Using Linear Support Vector Machines</source> in <conf-name>ICML 2013 Challenges in Representation Learning Workshop</conf-name> (<publisher-loc>Ontario</publisher-loc>: <publisher-name>arXiv</publisher-name>).</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Uijlings</surname>
<given-names>J. R. R.</given-names>
</name>
<name>
<surname>van de Sande</surname>
<given-names>K. E. A.</given-names>
</name>
<name>
<surname>Gevers</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Smeulders</surname>
<given-names>A. W. M.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Selective Search for Object Recognition</article-title>. <source>Int. J. Comput. Vis.</source> <volume>104</volume>, <fpage>154</fpage>&#x2013;<lpage>171</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-013-0620-5</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Artificial Intelligence in Tongue Diagnosis: Using Deep Convolutional Neural Network for Recognizing Unhealthy Tongue with Tooth-Mark</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>18</volume>, <fpage>973</fpage>&#x2013;<lpage>980</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2020.04.002</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lou</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Huo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pang</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Constructing Tongue Coating Recognition Model Using Deep Transfer Learning to Assist Syndrome Diagnosis and its Potential in Noninvasive Ethnopharmacological Evaluation</article-title>. <source>J. Ethnopharmacology</source> <volume>285</volume> (<issue>1</issue>), <fpage>114905</fpage>. <pub-id pub-id-type="doi">10.1016/j.jep.2021.114905</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A Weakly Supervised Tooth-Mark and Crack Detection Method in Tongue Image</article-title>. <source>Concurr. Computat. Pract. Exper.</source> <volume>33</volume> (<issue>16</issue>), <fpage>e6262</fpage>. <pub-id pub-id-type="doi">10.1002/cpe.6262</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Multi-Task Joint Learning Model for Segmenting and Classifying Tongue Images Using a Deep Neural Network</article-title>. <source>IEEE J. Biomed. Health Inform.</source> <volume>24</volume> (<issue>9</issue>), <fpage>2481</fpage>&#x2013;<lpage>2489</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2020.2986376</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Objective Research on Tongue Manifestation of Patients with Eczema</article-title>. <source>Technol. Health Care</source> <volume>25</volume> (<issue>S1</issue>), <fpage>143</fpage>&#x2013;<lpage>149</lpage>. <pub-id pub-id-type="doi">10.3233/THC-171316</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2017</year>). <source>Tongue Image Analysis</source>. <publisher-loc>New York, NY; Berlin, Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>. </citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Khosla</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lapedriza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Oliva</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Torralba</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Learning Deep Features for Discriminative Localization</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition</conf-name> (<publisher-loc>Cambridge</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2921</fpage>&#x2013;<lpage>2929</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.319</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Khosla</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lapedriza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Oliva</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Torralba</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Object Detectors Emerge in Deep Scene CNNs</article-title>,&#x201d; in <conf-name>ICLR</conf-name> (<publisher-loc>Cambridge</publisher-loc>: <publisher-name>arXiv</publisher-name>). </citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zitnick</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Edge Boxes: Locating Object Proposals from Edges</article-title>,&#x201d; in <conf-name>Proc. ECCV</conf-name> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>391</fpage>&#x2013;<lpage>405</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-10602-1_26</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>