<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2024.1483052</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>WSSS-CRAM: precise segmentation of histopathological images via class region activation mapping</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Pan</surname> <given-names>Ningning</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/1352832/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Mi</surname> <given-names>Xiangyue</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/2821408/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Hongzhuang</given-names></name>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ge</surname> <given-names>Xinting</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/2820655/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sui</surname> <given-names>Xiaodan</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/2147366/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Jiang</surname> <given-names>Yanyun</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1992095/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff><institution>Shandong Normal University</institution>, <addr-line>Jinan</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Chen Li, Northeastern University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Changyu Wu, Xuzhou Medical University, China</p>
<p>Jin Gu, Southwest Jiaotong University, China</p>
<p>Yina Wang, Nanjing Forestry University, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Yanyun Jiang <email>yanyun.jiang&#x00040;qq.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>10</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1483052</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>18</day>
<month>09</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Pan, Mi, Li, Ge, Sui and Jiang.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Pan, Mi, Li, Ge, Sui and Jiang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Fast, accurate, and automatic analysis of histopathological images using digital image processing and deep learning technology is a necessary task. Conventional histopathological image analysis algorithms require the manual design of features, while deep learning methods can achieve fast prediction and accurate analysis, but rely on the drive of a large amount of labeled data.</p>
</sec>
<sec>
<title>Methods</title>
<p>In this work, we introduce WSSS-CRAM, a weakly-supervised semantic segmentation method that can obtain detailed pixel-level labels from image-level annotated data. Specifically, we use a discriminative activation strategy to generate category-specific image activation maps via class labels. The category-specific activation maps are then post-processed using conditional random fields to obtain reliable regions that are directly used as ground-truth labels for the segmentation branch. Critically, the two steps of the pseudo-label acquisition and training segmentation model are integrated into an end-to-end model for joint training in this method.</p>
</sec>
<sec>
<title>Results</title>
<p>Through quantitative evaluation and visualization results, we demonstrate that the framework can predict pixel-level labels from image-level labels, and also perform well when testing images without image-level annotations.</p>
</sec>
<sec>
<title>Discussion</title>
<p>In future work, we will consider extending the algorithm to different pathological datasets and types of tissue images to validate its generalization capability.</p>
</sec></abstract>
<kwd-group>
<kwd>histopathological image</kwd>
<kwd>precise semantic segmentation</kwd>
<kwd>weakly-supervised method</kwd>
<kwd>category-specific image activation maps</kwd>
<kwd>deep learning</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="2"/>
<equation-count count="14"/>
<ref-count count="48"/>
<page-count count="16"/>
<word-count count="8691"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Systems Microbiology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Cancer is a leading cause of death worldwide, with increasing incidence and mortality rates, and high treatment costs that impose a heavy burden on families and society (Sung et al., <xref ref-type="bibr" rid="B39">2021</xref>; Ferlay et al., <xref ref-type="bibr" rid="B12">2021</xref>). Histopathological slides are the gold standard for cancer diagnosis, providing not only basic information on tumor grading and subtype classification but also a wealth of information about the tumor microenvironment (TME). This not only plays a crucial role in explaining tumor development and metastasis but also in influencing the treatment outcomes and prognosis of cancer patients. Recent studies have found that the spatial organization of different tissues and cells is highly correlated with tumor progression, and TME features can reveal gene expression in biological pathways (Wang et al., <xref ref-type="bibr" rid="B41">2020</xref>). Therefore, there is an urgent need for detailed segmentation of different tissues for further clinical research.</p>
<p>Clinically, histopathological slides are visually inspected by pathologists and evaluated semi-quantitatively, and the diagnostic results are reflected in the pathology report. Quantitative assessment for research purposes requires manual annotation by pathologists. However, the reproducibility and consistency of manual segmentation have been questioned due to inter-observer annotation differences and inter-observer variability (Wang et al., <xref ref-type="bibr" rid="B41">2020</xref>). Due to the specific data storage format and large size of histopathological slides, specific tools need to be used for viewing and labeling, such as QuPath (Bankhead et al., <xref ref-type="bibr" rid="B5">2017</xref>), which makes data annotation work difficult. In addition, manual annotation is very time-consuming and labor-intensive, requiring several days for detailed segmentation of each histopathological slide. Therefore, public research on histopathological image segmentation is usually limited to partial areas of pathological slides, or uses classification methods to achieve segmentation-like effects on whole-slice histopathological images (Lu et al., <xref ref-type="bibr" rid="B33">2021</xref>; Yan et al., <xref ref-type="bibr" rid="B44">2022</xref>; Pan et al., <xref ref-type="bibr" rid="B36">2023</xref>), with very few studies focusing on tissue segmentation in whole-slide histopathological images (Cardenas et al., <xref ref-type="bibr" rid="B6">2019</xref>; Amgad et al., <xref ref-type="bibr" rid="B3">2022</xref>; Chan et al., <xref ref-type="bibr" rid="B7">2019</xref>).</p>
<p>Therefore, it is imperative to develop fast and efficient methods for the rapid, accurate, and consistent delineation of target tissue areas. Semantic segmentation is a fundamental task in computer vision, and deep learning-based automatic segmentation frameworks have shown remarkable performance in medical image segmentation tasks (Hesamian et al., <xref ref-type="bibr" rid="B18">2019</xref>; Xun et al., <xref ref-type="bibr" rid="B43">2022</xref>), achieving outstanding results in various competitions. Popular models for this task include FCN (Long et al., <xref ref-type="bibr" rid="B32">2015</xref>), U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B38">2015</xref>), V-Net (Milletari et al., <xref ref-type="bibr" rid="B34">2016</xref>), nnU-Net (Isensee et al., <xref ref-type="bibr" rid="B21">2021</xref>), among others. Furthermore, other hybrid models have also demonstrated excellent performance in medical image segmentation (Jin et al., <xref ref-type="bibr" rid="B23">2021</xref>; Leube et al., <xref ref-type="bibr" rid="B28">2023</xref>; He et al., <xref ref-type="bibr" rid="B15">2023</xref>).</p>
<p>However, there are two major challenges in using deep-learning-based segmentation algorithms for histopathological image analysis tasks: (1) the performance of deep learning models heavily relies on the quality and quantity of annotated data, and histopathological image data is difficult to annotate, with pixel-level annotation being even more challenging; (2) tumors from different regions exhibit specificity, resulting in high costs for the transfer learning of trained networks.</p>
<p>Although high-quality pixel-level annotation data is scarce, coarse-grained or image-level annotation data is readily available. In fact, for the problem of analyzing histopathological images, there are publicly available datasets that can be downloaded and used for research, such as TCGA,<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> which contains tumor and normal tissues from over 11,000 patients. The database provides image-level descriptions of entire tissue pathology slides and corresponding genomic sequencing results. To reduce the need for pixel-level annotated images during model training, researchers have proposed semi-supervised and weakly supervised learning models, which attempt to improve the model&#x00027;s performance and generalization ability by leveraging unlabeled or image-level annotated data.</p>
<p>Drawing inspiration from weakly-supervised deep learning methods, we propose a weakly-supervised segmentation algorithm based on Class Region Activation Maps (CRAM) for tissue region segmentation in histopathological images. The framework utilizes image-level annotations to obtain Class Activation Maps (CAM) as pseudo-labels for semantic segmentation. The algorithm can be summarized into two main steps: (1) Obtain the CRAM: using a deep learning classification model, high-quality pixel-level pseudo-labels are generated based on image-level labels. (2) Train a segmentation model: the pixel-level pseudo-labels generated in step (1) are used as ground truth for model training. However, salient region activation can exhibit a higher response to a single class, while typically, multiple classes are present in one region of a pathological image. Therefore, this paper uses a Discriminative Activation (DA) layer to generate specific category masks for foreground and background, which serve as initial segmentation responses. To further increase the reliability of the pseudo-labels, this paper introduces a joint training method by merging the two steps into an end-to-end model. Furthermore, a joint loss function is adopted to optimize both branches, thereby improving the pseudo-labels&#x00027; quality. Additionally, an additional Conditional Random Field (CRF) operation is performed on the activation regions, which are modified into more reliable regions as pseudo-labels.</p>
<p>This approach primarily focuses on whole-slide images (WSI) of lung adenocarcinoma stained with H&#x00026;E. The research dataset is sourced from the WSSS4LUAD<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref> challenge dataset, with the goal of achieving pixel-level segmentation for normal tissue, tumor epithelium, and tumor-associated stroma within the histopathological sections. <xref ref-type="fig" rid="F1">Figure 1</xref> presents image patches extracted from whole-slide pathology images of lung adenocarcinoma, scanned at a resolution of 0.2517&#x003BC;<italic>m</italic>/<italic>pixel</italic> and 40 &#x000D7; magnification. Corresponding segmentation labels for the three prevalent tissue types are also provided. As depicted, these three tissue types may simultaneously appear within a single image patch, particularly tumor epithelium and tumor-associated stroma, since tumor cells often adhere to the stroma. Thus, tumors and stroma frequently coexist in the same image patch. <xref ref-type="fig" rid="F2">Figure 2</xref> displays examples from the training dataset, where each image patch is annotated with image-level labels indicating the presence of tumor, stroma, and normal tissue. The training dataset encompasses a total of 10,091 image patches. A comprehensive description of the dataset is presented in Section 4.1 of this paper.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Histopathological images of lung adenocarcinoma tissue and their segmentation illustration. The blue area in the image represents the tumor region, the green area represents the stroma region, and the yellow area represents the normal region.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0001.tif"/>
</fig>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Examples from the training set of the WSSS4LUAD Challenge. 1 indicates the presence of the tissue in the image, while 0 indicates the absence of the tissue in the image. <bold>Top row</bold>: Tumor region; <bold>Second row</bold>: Tumor and stroma region; <bold>Third row</bold>: Stroma region; <bold>Fourth row</bold>: Normal region.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0002.tif"/>
</fig>
<p>Our main contributions are illustrated as follows: (1) Proposing a WSSS-CRAM that improves the traditional CAM method by activating corresponding regions for each class in the image, effectively utilizing the supervisory information of image-level labels. (2) Integrating the steps of obtaining pseudo-labels and training the segmentation model into an end-to-end model for joint training. (3) Performing additional post-processing on the activation regions, using a CRF operation to modify the activation regions into more reliable pseudo-label regions.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>This paper centers on the main research subject of semantically segmenting tissue in lung adenocarcinoma. The pertinent techniques predominantly center on weakly-supervised segmentation methods based on CAM. Therefore, before delving into the specifics of the methods, we initially introduce the task of region segmentation in histopathological images of tissues. Following that, we offer a concise analysis of pertinent research concerning weakly-supervised segmentation methods.</p>
<sec>
<title>2.1 Histopathological image segmentation</title>
<p>Since the emergence of whole-slide pathology scanning techniques, the utility of whole-slide tissue pathology imaging has been confirmed across various applications within the realm of pathology. Digitized tissue pathology images have facilitated tasks including remote expert consultations, prognostic analysis, and tumor biomarker assessment (Kumar et al., <xref ref-type="bibr" rid="B26">2020</xref>). As scanning technologies and computational capacities have advanced, significant strides have also been made in the domain of tissue pathology image segmentation. Early approaches entailed manual feature extraction, employing models such as support vector machines and Bayesian models for the segmentation of tissue pathology images. For example, Hiary et al. (<xref ref-type="bibr" rid="B19">2013</xref>) employed a Bayesian model to automatically segment stromal tissue in breast tissue pathology images, leveraging color and texture attributes. With the advancement of deep learning techniques, the remarkable performance exhibited by deep learning in image segmentation has prompted its application in the segmentation of tissue pathology images. Among these techniques, FCN and U-Net have emerged as the most frequently employed foundational architectures. For instance, Chen et al. (<xref ref-type="bibr" rid="B8">2017a</xref>) introduced the utilization of a Deep Contour-Aware Network (DCAN) for the segmentation of colonic glands. This model incorporated auxiliary supervision mechanisms to tackle the challenge of gradient vanishing during training (Chen et al., <xref ref-type="bibr" rid="B8">2017a</xref>). This approach secured the first rank in the 2015 MICCAI Gland Segmentation Challenge and the 2015 MICCAI Nuclei Segmentation Challenge. Oskal et al. 
(<xref ref-type="bibr" rid="B35">2019</xref>) employed a U-Net-based architecture to achieve a positive predictive value of 0.89 &#x000B1; 0.16 and sensitivity of 0.92 &#x000B1; 0.1 in epidermal or non-epidermal pixel classification tasks. In recent years, semi-supervised methods have also gradually been employed in tissue pathology image segmentation tasks to address the issue of limited annotated data (Jin et al., <xref ref-type="bibr" rid="B24">2022</xref>).</p>
<p>Moreover, in recent years, various international competitions have introduced challenges related to the analysis of tissue pathology regions. For instance, the Digestive-System Pathological Detection and Segmentation Challenge (DigestPath 2019) held within MICCAI 2019 (Da et al., <xref ref-type="bibr" rid="B11">2022</xref>; Li et al., <xref ref-type="bibr" rid="B29">2019</xref>) was centered around automating the segmentation of benign and malignant regions within complete tissues. The Multi-organ Nuclei Segmentation and Classification Challenge (MoNuSAC) (Verma et al., <xref ref-type="bibr" rid="B40">2021</xref>) in ISBI 2020 encompassed the identification and segmentation of multiple cell types across four organs. Additionally, the AGGC 2022 (Automated Gleason Grading Challenge) within MICCAI 2022 addressed the automatic segmentation of five tissue types in prostate cancer whole-slide pathology images.</p>
</sec>
<sec>
<title>2.2 Weakly-supervised semantic segmentation utilizing CAM</title>
<p>Instance segmentation, one of the most challenging problems in computer vision, has undergone extensive research (He et al., <xref ref-type="bibr" rid="B16">2017</xref>; Arnab and Torr, <xref ref-type="bibr" rid="B4">2017</xref>; Liu et al., <xref ref-type="bibr" rid="B31">2018</xref>). However, many of these studies necessitate manual annotation of instance masks to provide strong supervision, thereby constraining their utility on datasets with sparsely annotated structures. Semi-supervised and weakly supervised instance segmentation strategies strive to transcend this constraint. In scenarios involving solely image-level categories, synthetic labels extracted from class response maps are harnessed to train networks for paired semantic segmentation (Ahn and Kwak, <xref ref-type="bibr" rid="B2">2018</xref>). Employing a classification model to derive CAM stands as a standardized process for generating pseudo masks in the realm of Weakly Supervised Semantic Segmentation (WSSS).</p>
<sec>
<title>2.2.1 Class activation maps</title>
<p>The Vanilla CAM approach initially scales the feature map using fully connected weights learned for each individual class. Subsequently, seed masks are generated through channel averaging, spatial normalization, and thresholding (Zhou et al., <xref ref-type="bibr" rid="B48">2016</xref>). The GAIN model applies CAM to the original image for mask generation, minimizing model prediction scores to capture features beyond the prior step&#x00027;s activation map in successive training rounds. This gradually refines the activated regions, ensuring complete coverage of the target area (Li et al., <xref ref-type="bibr" rid="B30">2018</xref>). Recently emerged erase-based approaches also embrace similar principles (Zhang et al., <xref ref-type="bibr" rid="B47">2018</xref>; Kweon et al., <xref ref-type="bibr" rid="B27">2021</xref>). The distinction lies in their direct erasure of seed regions in CAM, followed by inputting the erased image into the model to generate the next round&#x00027;s CAM, expected to capture new regions. Moreover, certain schemes have been proposed to optimize CAM. For instance, in Qin et al. (<xref ref-type="bibr" rid="B37">2022</xref>), Activation Modulation and Recalibration Scheme (AMR) employs channel/spatial attention mechanisms for fine-tuning activation area calibration, thereby achieving adaptive modulation for segmentation-oriented activation responses. The ReCAM strategy reactivates CAM activation regions using Softmax Cross-Entropy Loss (SCL), resulting in ReCAM with Binary Cross-Entropy (BCE) constraints (Chen et al., <xref ref-type="bibr" rid="B10">2022</xref>). Embedded Discriminative Attention Mechanism (EDAM) is a recent endeavor that employs CAM-based perturbations to optimize an additional classifier. It employs an extra DA layer to generate class-specific masks (Wu et al., <xref ref-type="bibr" rid="B42">2021</xref>).</p>
</sec>
<sec>
<title>2.2.2 Generation of pseudo-labels</title>
<p>The seed masks generated from CAM or its variations can undergo refinement steps to enhance the quality of pseudo-labels, employing both non-learning-based and learning-based methods. SEC introduced the principles of Seed, Expand, and Constrain for refining CAM, which have been widely adopted by subsequent works (Kolesnikov and Lampert, <xref ref-type="bibr" rid="B25">2016</xref>). Among these, CRF is an earlier post-processing method that is user-friendly, independent of features extracted by the trained model, and relies solely on the original image features. DSRG, inspired by Seeded Region Growing (SRG), employs CAM as seeds to expand regions of interest (Huang et al., <xref ref-type="bibr" rid="B20">2018</xref>). This approach integrates the SRG process into the deep segmentation network, deviating from the previous strategy of training segmentation models using pseudo-labels generated through SRG.</p>
<p>Learning-based methods introduce additional network modules. For example, AffinityNet employs a deep neural network to predict semantic affinities between adjacent image coordinates, achieving semantic propagation through random walks (Ahn and Kwak, <xref ref-type="bibr" rid="B2">2018</xref>). IRNet estimates rough regions of individual instances and detects boundaries between different object classes. It focuses on pixel relations on the graph and computes affinities based on these relations (Ahn et al., <xref ref-type="bibr" rid="B1">2019</xref>). Furthermore, incorporating confidence regions from saliency maps into CAM for pseudo-label refinement has become a common practice in recent methodologies (Chen et al., <xref ref-type="bibr" rid="B10">2022</xref>; Wu et al., <xref ref-type="bibr" rid="B42">2021</xref>). Approaches like OOA (Jiang et al., <xref ref-type="bibr" rid="B22">2019</xref>) and CONTA (Zhang et al., <xref ref-type="bibr" rid="B46">2020b</xref>) integrate CAM inferences generated through multiple training iterations, directing attention accumulation toward various parts of objects.</p>
</sec>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methodology</title>
<p>In this section, the main focus is on introducing the CRAM algorithm framework. We provide a comprehensive explanation of the CNN-based pseudo-label acquisition module, the target semantic segmentation module, and the employed loss functions in the algorithm.</p>
<sec>
<title>3.1 Framework</title>
<p>The foundational model for the CAM-based weakly-supervised segmentation algorithm used in this paper is divided into two distinct steps: pseudo-label acquisition and independent segmentation model training modules, as depicted in <xref ref-type="fig" rid="F3">Figure 3</xref>. The pseudo-label acquisition module utilizes a standard image classification network supervised by image-level labels. By accentuating response areas of image-level labels through CAM, it generates pixel-level masks corresponding to each image, serving as pseudo-labels for the semantic segmentation module. The semantic segmentation module can be any end-to-end segmentation network, using the pixel-level pseudo-labels generated by the pseudo-label acquisition module as actual labels for training the model. During inference, segmentation predictions can be achieved solely by utilizing the semantic segmentation module.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Basic conceptual diagram of the CAM-based weakly-supervised segmentation algorithm, which contains two main modules: <bold>(A)</bold> The pseudo-label acquisition module; <bold>(B)</bold> The semantic segmentation module.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0003.tif"/>
</fig>
<p>The model presented in this paper is based on the algorithm outlined in <xref ref-type="fig" rid="F3">Figure 3</xref> and is divided into two primary modules: the pseudo-label acquisition module and the semantic segmentation module. Differing from the majority of previous methodologies that adopt independent two-step procedures, this paper amalgamates pseudo-label acquisition and semantic segmentation into a cohesive end-to-end model for joint training. As illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>, following feature extraction by a backbone network, the image is directed to both the pseudo-label acquisition module and the semantic segmentation module. The integrated model is subject to joint training via a full loss function.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Weakly-supervised segmentation algorithm based on category activation regions.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0004.tif"/>
</fig>
<p>The pseudo-label acquisition module: Within this module, the model incorporates a DA layer to extract category-specific activation regions. Unlike CAM, which employs a single activation map for classification, the DA layer generates category-specific activation maps for each category. These category-specific activation maps are fused with the original feature layer to derive category-specific feature maps. The self-supervised layer explores collaborative information within and across images in a batch. Ultimately, classification predictions are made based on the collaborative information corresponding to each image. Given that all images in the training set are associated with image-level labels, a binary cross-entropy (BCE) loss function is employed independently for each category.</p>
<p>The semantic segmentation module: This module initially refines the feature maps extracted from the backbone network through a series of convolutional layers. Subsequently, an independent CRF is employed to enhance the category-specific activation maps obtained from the pseudo-label acquisition module. This refinement process helps eliminate mislabeled pixels, resulting in comparatively reliable pseudo-labels. The target semantic segmentation module applies cross-entropy loss and energy loss to the confident and non-confident regions of the pseudo-labels, respectively.</p>
<p>Joint loss function: The loss function is used to supervise the optimization of parameters within the model. In the presented algorithm, the classification and segmentation models are integrated into an end-to-end framework for joint training. As a result, the overall loss function comprises a binary cross-entropy loss for classification, as well as cross-entropy loss and energy loss for segmentation.</p>
</sec>
<sec>
<title>3.2 Pseudo-label acquisition based on CNN classification model</title>
<sec>
<title>3.2.1 Discriminative activation layer</title>
<p>For a given batch of data <italic>X</italic> &#x0003D; {(<italic>x</italic><sub><italic>n</italic></sub>, <italic>l</italic><sub><italic>n</italic></sub>)}<sup><italic>N</italic></sup>, where <italic>N</italic> represents the number of images in the mini-batch, <italic>x</italic><sub><italic>n</italic></sub> represents the <italic>n</italic>-th image in this batch, and <italic>l</italic><sub><italic>n</italic></sub> represents the corresponding class label. It should be noted that <italic>l</italic><sub><italic>n</italic></sub> is represented as {0, 1}<sup><italic>K</italic></sup>, indicating image-level labels corresponding to <italic>K</italic> categories. The backbone network extracts the feature map <inline-formula><mml:math id="M1"><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>R</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> corresponding to image <italic>x</italic><sub><italic>n</italic></sub>, where <italic>C</italic> represents the number of channels in the feature map, and <italic>H</italic> and <italic>W</italic> represent the height and width of the feature map. 
The DA layer is then connected to generate activation maps <inline-formula><mml:math id="M2"><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>R</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> corresponding to <italic>K</italic> target categories. To explicitly represent the background region, in addition to generating activation maps for each category, the DA layer also generates activation maps corresponding to the background.</p>
<p>Applying L2-norm regularization to the activation map <italic>M</italic><sub><italic>n</italic></sub> can generate pixel-level probabilities for the corresponding class or background:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mn>2</mml:mn><mml:mo>-</mml:mo><mml:mo class="qopname">norm</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>After the L2-norm regularization operation, <inline-formula><mml:math id="M4"><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the pixel-level class probability distribution at position (<italic>i, j</italic>), and <inline-formula><mml:math id="M5"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the probability corresponding to class <italic>k</italic> at position (<italic>i, j</italic>). Through the above operations, activation maps corresponding to each category in the image are obtained.</p>
</sec>
<sec>
<title>3.2.2 Self-supervised layer</title>
<p>Combining the feature map <inline-formula><mml:math id="M6"><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>R</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> corresponding to image <italic>x</italic><sub><italic>n</italic></sub> with the activation map <inline-formula><mml:math id="M7"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> corresponding to <italic>K</italic> target categories generates feature maps for each class:</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M9"><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the feature map corresponding to category <italic>k</italic> in the image <italic>x</italic><sub><italic>n</italic></sub>.</p>
<p>For a batch of <italic>B</italic> images, the corresponding feature maps are represented as <inline-formula><mml:math id="M10"><mml:msup><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>F</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:msubsup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>R</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>C</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>. 
After a 1 &#x000D7; 1 convolution, the feature maps are transformed into activation features <inline-formula><mml:math id="M11"><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>F</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>R</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>C</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>H</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> corresponding to each category. The combination of activation maps with the initial feature maps is used to explore collaborative information specific to category activation maps. The self-supervised layer simultaneously considers feature attention within and between images in a batch, making the exploration of collaborative information more effective. The model generates category-specific feature maps for each category, using global average pooling and employing a specific classifier for label prediction of the given category. Since in histopathological images, one image often corresponds to multiple image categories, to make the activation regions corresponding to categories more effective, this paper transforms the multi-class problem into multiple binary classification problems.</p>
<p>The purpose of the self-supervised layer is to highlight similar regions in the activation maps corresponding to images in a batch through self-attention mechanisms, to obtain better activation maps for each category.</p>
</sec>
<sec>
<title>3.2.3 Classification loss function</title>
<p>The category-specific features output by the self-supervised layer are mapped to categories through a fully connected layer, with image-level labels corresponding to the image as supervision. The classification loss function is represented as:</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>K</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>B</mml:mi><mml:mi>C</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mo 
stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo class="qopname">SelfAttention</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>F</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M14"><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the activation map corresponding to input image <italic>x</italic><sub><italic>n</italic></sub> after the self-supervised layer for the <italic>k</italic>-th category, <inline-formula><mml:math id="M15"><mml:msubsup><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> represents the true label of input image <italic>x</italic><sub><italic>n</italic></sub> corresponding to the <italic>k</italic>-th category. Since the input to the self-supervised layer is a combination of category-specific activation maps and initial feature maps, the loss function of the self-supervised layer will, through backpropagation, affect the distinguishing activation layers of all foreground categories, thereby influencing the parameter training of the backbone network.</p>
</sec>
</sec>
<sec>
<title>3.3 Target semantic segmentation model</title>
<sec>
<title>3.3.1 Reliable semantic segmentation labels</title>
<p>From the pseudo-label acquisition module, activation maps corresponding to each category can be obtained, which highlight the regions where each category plays a role in classification. In this activation map, the high-confidence foreground and background regions are selected as reliable regions, and the remaining regions are treated as unreliable regions. High-confidence maps are represented as:</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M16"><mml:mrow><mml:msup><mml:mi>p</mml:mi><mml:mi>r</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mover accent='true'><mml:mi>M</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>k</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;if&#x000A0;</mml:mtext><mml:msup><mml:mover accent='true'><mml:mi>M</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>k</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x0003C;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mtext>&#x000A0;or&#x000A0;</mml:mtext><mml:msup><mml:mover accent='true'><mml:mi>M</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>k</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x0003E;</mml:mo><mml:mi>&#x003B2;</mml:mi></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>255</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;else&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where &#x003B1; and &#x003B2; represent pre-established thresholds. When the activation value falls below &#x003B1;, it signifies the region as a dependable background area; conversely, when the activation value surpasses &#x003B2;, the region is retained as a dependable foreground area.</p>
<p>We employ CRF for post-processing the activation maps, removing incorrectly labeled pixels, and enhancing the probability maps associated with each category:</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>R</mml:mi><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mover accent="false"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Taking into account the constraints imposed by CRF on the activation maps, the ultimate pixel-level pseudo-labels are as follows:</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M18"><mml:mrow><mml:msup><mml:mi>p</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>u</mml:mi><mml:mi>d</mml:mi><mml:mi>o</mml:mi></mml:mrow></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mi>p</mml:mi><mml:mi>r</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;if&#x000A0;</mml:mtext><mml:msup><mml:mi>p</mml:mi><mml:mi>r</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>255</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mo>.</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>If <italic>p</italic><sup><italic>r</italic></sup>(<italic>i, j</italic>) &#x0003D; <italic>p</italic><sup><italic>crf</italic></sup>(<italic>i, j</italic>), signifying alignment between the high-confidence map and the CRF activation map, we retain this region as the confident pseudo-label area, with the rest designated as non-confident pseudo-label areas.</p>
</sec>
<sec>
<title>3.3.2 Segmentation loss function</title>
<p>The pseudo-labels generated by the model serve as the ground truth labels for training the semantic segmentation module, encompassing both areas with high-confidence pseudo-labels and areas with low-confidence pseudo-labels.</p>
<p>In the case of confident pseudo-label regions, the model utilizes the standard cross-entropy loss function, denoted as:</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:mi>&#x003C6;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>B</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>B</italic>(<italic>i, j</italic>) is a binary label indicating whether the label belongs to class <italic>k</italic>. &#x003C6; represents the confident pseudo-label region, i.e., when <italic>p</italic><sup><italic>pseudo</italic></sup>(<italic>i, j</italic>)&#x02260;255. <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the prediction of the segmentation model.</p>
<p>The model utilizes the dense energy loss function (Zhang et al., <xref ref-type="bibr" rid="B45">2020a</xref>), applied to both confident and non-confident regions, and it is represented as:</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>g</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none none none none none none none none none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>b</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02260;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mi>S</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>E</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>S</italic>(<italic>i, j</italic>) represents a soft filter. For regions with confident pseudo-labels, soft filter weights are determined based on the model&#x00027;s predicted class probabilities. In contrast, for regions with non-confident pseudo-labels, a dense energy loss is employed. Cross-entropy loss functions are designed for hard labels, while the pseudo-labels used in this study are not guaranteed to be 100% accurate. Therefore, applying the cross-entropy loss directly to confident regions could introduce errors during model training. The dense energy loss function, using a soft labeling strategy for confident regions, allows for further refinement of the confident regions generated in the preceding step. <italic>S</italic>(<italic>i, j</italic>) is defined as:</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M22"><mml:mrow><mml:mi>S</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mrow><mml:mi>max</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>K</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy='true'>(</mml:mo><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo stretchy='true'>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02208;</mml:mo><mml:mi>&#x003C6;</mml:mi></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;else&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>Here, <italic>E</italic>((<italic>i, j</italic>), (<italic>a, b</italic>)) represents the energy formula that characterizes the relationship between pixel (<italic>i, j</italic>) and pixel (<italic>a, b</italic>):</p>
<disp-formula id="E11"><label>(11)</label><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>E</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none none none none none none none none none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mi>K</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02260;</mml:mo><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:munder></mml:mstyle><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>G</italic>((<italic>i, j</italic>), (<italic>a, b</italic>)) is a Gaussian filter.</p>
<p>The total loss function associated with the target semantic segmentation network comprises both cross-entropy loss and energy loss:</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>g</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
</sec>
<sec>
<title>3.4 Joint loss function</title>
<p>The approach presented in this paper integrates classification and segmentation models into an end-to-end framework. The overall loss function comprises the <inline-formula><mml:math id="M25"><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> loss function from the pseudo-label acquisition module and the <inline-formula><mml:math id="M26"><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> loss function from the semantic segmentation network. The combined loss function is shown below:</p>
<disp-formula id="E13"><label>(13)</label><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mo>&#x003BB;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mo>&#x003BB;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>g</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003BB; is a weighting coefficient that controls the balance between the pseudo-label acquisition module and the target segmentation module.</p>
</sec>
<sec>
<title>3.5 Independent semantic segmentation model</title>
<p>After weakly-supervised training, the combination of the backbone network and the target semantic segmentation network can serve as an independent inference module for generating semantic segmentation results during the testing phase. Alternatively, the model proposed in this paper can be used as a whole for pseudo-label acquisition. During the training phase, the segmentation model outputs optimized region segmentation results, which are used as artificial pseudo-labels for an independent semantic segmentation model.</p>
<p>Define an independent semantic segmentation module: This semantic segmentation module is designed as a standalone component, utilizing pseudo-labels obtained from the previous step&#x00027;s image classification model as training labels for the training model. During the final inference phase, running inference is as simple as using this trained model. The standalone segmentation model can employ any end-to-end semantic segmentation model as its backbone network, such as FCN (Long et al., <xref ref-type="bibr" rid="B32">2015</xref>), U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B38">2015</xref>), DeepLab v3 (Chen et al., <xref ref-type="bibr" rid="B9">2017b</xref>), and so on. In this paper, we draw inspiration from previous research in weakly-supervised segmentation, where the semantic segmentation module combines the ResNet model and the DeepLab v3 model. This network model consists of two parts: an Encoder based on the ResNet model and a Decoder based on the DeepLab v3.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Algorithm validation and evaluation</title>
<sec>
<title>4.1 Datasets</title>
<p>The dataset used in this paper is publicly available data from the WSSS4LUAD challenge (Han et al., <xref ref-type="bibr" rid="B13">2022a</xref>,<xref ref-type="bibr" rid="B14">b</xref>), which includes 67 H&#x00026;E (Hematoxylin and eosin)-stained WSI (Whole Slide Images) from the Guangdong Provincial People&#x00027;s Hospital (GDPH) and 20 WSI images from the TCGA public dataset. These images have annotations for three common and meaningful tissue types: tumor epithelial tissue, stromal tissue, and normal tissue.</p>
<p>The training dataset in this dataset consists of 63 WSI (49 from GDPH and 14 from TCGA), from which 10,091 image patches were cropped and selected. The image size ranges from 150 &#x000D7; 150 to 300 &#x000D7; 300. Each image in the training set has image-level annotations in the form of a three-digit label [tumor, stroma, normal]. It includes 6,579 images of tumor tissue, 7,076 images of stromal tissue, and 1,832 images of normal tissue. The most common label is [1,1,0], indicating images containing both tumor and stroma, with a total of 5,393 images. This is followed by 1,832 images with the [0,0,1] label (indicating normal tissue), 1,680 images with the [0,1,0] label (indicating stromal tissue), and 1,181 images with the [1,0,0] label (indicating tumor tissue).</p>
<p>The validation set comprises 12 WSI (9 from GDPH and 3 from TCGA), from which 40 image patches are cropped. These include 9 large image patches ranging in size from 1,500 &#x000D7; 1,500 to 5,000 &#x000D7; 5,000 and 31 small image patches ranging in size from 200 &#x000D7; 200 to 500 &#x000D7; 500. The validation dataset has pixel-level labels and is used to validate the trained models.</p>
<p>The test set also consists of 12 WSI (9 from GDPH and 3 from TCGA), from which 80 image patches are cropped. These include 14 large image patches ranging in size from 1,500 &#x000D7; 1,500 to 5,000 &#x000D7; 5,000 and 66 small image patches ranging in size from 200 &#x000D7; 200 to 500 &#x000D7; 500. The test dataset has pixel-level labels and is used for the final model testing.</p>
</sec>
<sec>
<title>4.2 Experimental settings</title>
<p>This experiment was conducted in a PyTorch environment, utilizing NVIDIA CUDA (version 11.4) and cuDNN library (version 8.2.2). All experiments were performed on a computer running Ubuntu 20.04 LTS, using 4 NVIDIA Tesla A100 GPUs with 40GB of VRAM each. The model&#x00027;s backbone network was pre-trained on the ImageNet dataset and further fine-tuned on the target dataset used in this paper.</p>
<p>The model used an SGD optimizer with a batch size of 8, an initial learning rate of 0.001, weight decay set to 0.0002, and momentum set to 0.9. Two hyperparameters, &#x003B1; and &#x003B2;, were set to 0.3 and 0.9, respectively.</p>
<p>During both training and testing, a CRF operation was used to generate refined labels, with parameters following the default values as described in Huang et al. (<xref ref-type="bibr" rid="B20">2018</xref>). During training, the loss functions computed by the classification and segmentation modules were updated through backpropagation to update the backbone network. During testing, only the segmentation module was used to generate region segmentation corresponding to the images.</p>
<p>Considering the irregular sizes of image patches in this dataset, they were standardized through resizing before being fed into the model. During the training phase, the image dimensions were initially randomly increased to two to three times their original size. Subsequently, these enlarged images were uniformly cropped to a size of 513 &#x000D7; 513 pixels, serving as the input images for the model. In the testing phase, the image dimensions were enlarged to 2.5 times their original size, and the model made predictions and generated segmentation results based on the enlarged images. Due to limitations in GPU VRAM, particularly with extremely large pixel images, they were proactively cropped to a fixed size (ranging from 400 &#x000D7; 400 to 500 &#x000D7; 500 in this paper). The model&#x00027;s predicted results were then combined for visualization purposes.</p>
</sec>
<sec>
<title>4.3 Performance evaluation metrics</title>
<p>In the experiments, model evaluation is performed using the mean Intersection over Union (mIoU), which is expressed as follows:</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>m</mml:mi><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where TP stands for true positives (correctly predicted positive instances), while FN and FP represent false negatives (positive instances incorrectly predicted as negative) and false positives (negative instances incorrectly predicted as positive), respectively. The variable <italic>k</italic> denotes the number of classes. In our experiments, the test dataset includes a background label. Therefore, when computing the final mIoU, the background region is excluded and not included in the calculation area.</p>
</sec>
<sec>
<title>4.4 Model analysis</title>
<sec>
<title>4.4.1 Comparison with state-of-the-art methods</title>
<p><xref ref-type="table" rid="T1">Table 1</xref> presents a comparison between our proposed method and the existing fully supervised baseline segmentation methods as well as the top three performers in the WSSS challenge, including ChunhuiLin, baseline0412, and Vison307, with the best result highlighted in bold. The fully supervised approach was trained using training data containing only one tissue category, with [1,0,0], [0,0,1], and [0,1,0] corresponding to 1,181, 1,832, and 1,680 images, respectively. Among the compared weakly-supervised methods, ChunhuiLin, baseline0412, and Vison307 are semi-supervised methods. Training details can be found in the paper (Han et al., <xref ref-type="bibr" rid="B13">2022a</xref>). WSSS-CRAM1 entails training a model exclusively using image-level labels from the training set, without any reference to pixel-level labels from the validation set throughout the training process. Building upon jointly optimized pseudo-labels, WSSS-CRAM2 establishes a separate segmentation module to learn pixel-level pseudo-labels; the model is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. In contrast, WSSS-CRAM3 incorporates pixel-level labels from the validation set as a supervisory condition when training a separate segmentation model with pseudo-labels. Notably, our proposed approach, when training a dedicated semantic segmentation module and incorporating pixel-level labels from the validation set into the model training, achieves results differing by a mere 0.0012 from the competition&#x00027;s top performance, indicating a remarkable quantitative proximity. This outcome may be attributed to the omission of weight consideration for pseudo-labels compared to the known labels from the validation set during the model training process.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Comparison with the state-of-the-art methods.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left" colspan="2"><bold>Model</bold></th>
<th valign="top" align="center"><bold>mIoU</bold></th>
<th valign="top" align="center"><bold>Tumor</bold></th>
<th valign="top" align="center"><bold>Stroma</bold></th>
<th valign="top" align="center"><bold>Normal</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Supervised</td>
<td valign="top" align="center">U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B38">2015</xref>)</td>
<td valign="top" align="center">0.5362</td>
<td valign="top" align="center">0.4158</td>
<td valign="top" align="center">0.7075</td>
<td valign="top" align="center">0.4854</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">ResNet101 (He et al., <xref ref-type="bibr" rid="B17">2016</xref>)</td>
<td valign="top" align="center">0.5992</td>
<td valign="top" align="center">0.5312</td>
<td valign="top" align="center">0.7323</td>
<td valign="top" align="center">0.5342</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">DeepLab v3 (Chen et al., <xref ref-type="bibr" rid="B9">2017b</xref>)</td>
<td valign="top" align="center">0.6222</td>
<td valign="top" align="center">0.5859</td>
<td valign="top" align="center">0.7318</td>
<td valign="top" align="center">0.5489</td>
</tr>
<tr>
<td valign="top" align="left">Weakly-supervised</td>
<td valign="top" align="center">ChunhuiLin</td>
<td valign="top" align="center"><bold>0.8413</bold></td>
<td valign="top" align="center">0.8389</td>
<td valign="top" align="center">0.8919</td>
<td valign="top" align="center">0.7931</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">baseline0412</td>
<td valign="top" align="center">0.8222</td>
<td valign="top" align="center"><bold>0.8402</bold></td>
<td valign="top" align="center">0.8343</td>
<td valign="top" align="center">0.7921</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">Vison307</td>
<td valign="top" align="center">0.8058</td>
<td valign="top" align="center">0.8165</td>
<td valign="top" align="center">0.8554</td>
<td valign="top" align="center">0.7456</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">WSSS-CRAM1</td>
<td valign="top" align="center">0.7265</td>
<td valign="top" align="center">0.7074</td>
<td valign="top" align="center">0.8125</td>
<td valign="top" align="center">0.6597</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">WSSS-CRAM2</td>
<td valign="top" align="center">0.7618</td>
<td valign="top" align="center">0.7493</td>
<td valign="top" align="center">0.8237</td>
<td valign="top" align="center">0.7125</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">WSSS-CRAM3</td>
<td valign="top" align="center">0.8401</td>
<td valign="top" align="center">0.8293</td>
<td valign="top" align="center"><bold>0.8923</bold></td>
<td valign="top" align="center"><bold>0.7987</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best result obtained for predictions.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.4.2 Ablation experiment</title>
<p><xref ref-type="table" rid="T2">Table 2</xref> presents the results of ablation experiments aimed at demonstrating the effectiveness of our method&#x00027;s design. To maintain control over the variables in these experiments, we focused solely on the acquisition of pseudo-labels. In this process, pseudo-labels obtained from the training dataset were combined with pixel-level annotated labels from the validation data to train separate semantic segmentation modules. It&#x00027;s worth noting that CAM, which serves as the foundational strategy for obtaining pixel-level labels from image-level labels, was included in all ablation models. As observed in the table, the joint optimization of segmentation and classification modules yields a significant improvement in segmentation performance. Furthermore, the strategy of DA layer and CRF also contributes to enhancing segmentation performance.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Ablation experiments for each module in the network.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Joint optimization</bold></th>
<th valign="top" align="center"><bold>CAM</bold></th>
<th valign="top" align="center"><bold>DA</bold></th>
<th valign="top" align="center"><bold>CRF</bold></th>
<th valign="top" align="center"><bold>mIoU</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td valign="top" align="center">0.6925</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td valign="top" align="center">0.7680</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">0.7912</td>
</tr>
<tr>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td valign="top" align="center">0.7684</td>
</tr>
<tr>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td valign="top" align="center">0.8059</td>
</tr>
<tr>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center"><bold>0.8401</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best result obtained for predictions.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec>
<title>4.5 Visualized results</title>
<sec>
<title>4.5.1 Visual presentation of results</title>
<p><xref ref-type="fig" rid="F5">Figure 5</xref> presents the segmentation results obtained in the test dataset. The first column contains image blocks extracted from the overall histopathological image, the second column showcases the model&#x00027;s predictions, and the third column displays the ground truth labels. In the second row, specific details from the first-row images have been selectively magnified for closer inspection. The result images clearly demonstrate a close alignment between the model&#x00027;s predictions and the ground truth labels. The trained model exhibits the capability to accurately segment regions within histopathological images of lung adenocarcinoma. In the second row of enlarged images, regions where the model&#x00027;s predictions deviate from the ground truth labels are enclosed within blue and orange rectangles. Upon a closer examination of the original images, it becomes apparent that the region inside the blue rectangle corresponds to a blank area in the original image, whereas the region within the orange rectangle should indeed be labeled as stroma, consistent with the ground truth. While this result may differ from the manually annotated ground truth, it may offer a more precise representation of the intricate details in comparison to the human-annotated labels. This comparative analysis indicates that the model not only learns pixel-level annotations from image-level labels but also excels in accurately predicting tissue boundaries and intricate details.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Visualization of segmentation results: the first column features the original images, the second column showcases the model&#x00027;s predictions, and the third column reveals the ground truth labels. Notably, red boxes highlight representative regions, which are further magnified in the second row.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0005.tif"/>
</fig>
<p><xref ref-type="fig" rid="F6">Figure 6</xref> showcases various segmentation examples from the test dataset. In the first column, you&#x00027;ll find the original images, while the second column reveals the model&#x00027;s predictions, and the third column displays the ground truth labels. The result images clearly depict that the first and second rows represent image blocks from normal and stroma regions, respectively. In these cases, the model excels in delivering remarkably accurate predictions that closely align with the ground truth labels. Moving to the third row, we encounter images featuring the coexistence of tumors and stroma. Upon close examination, it becomes apparent that the model also produces relatively precise predictions, with minor boundary prediction errors occurring solely at the edges of the tumor and stroma regions. Finally, in the last row of <xref ref-type="fig" rid="F6">Figure 6</xref>, this is a typical example of a large image block, encompassing tumor, stroma, and normal areas. The model&#x00027;s prediction results affirm that the proposed method consistently yields precise segmentation results, even for intricate histopathological images.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Visualization of segmentation results: the first row is normal regions, the second row is stroma regions, the third row includes stroma and tumor, and the fourth row features a large image containing normal, stroma, and tumor areas.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0006.tif"/>
</fig>
</sec>
<sec>
<title>4.5.2 Category-specific activation maps from discriminative activation layer</title>
<p><xref ref-type="fig" rid="F7">Figure 7</xref> presents the activation maps generated by the model after differentiating the activation maps from the activation layer output. The data showcased here is sourced from the training dataset and, therefore, lacks corresponding pixel-level annotations. The four examples shown correspond to image-level labels [1,0,1], [1,1,0], [0,0,1], and [0,1,0], representing tumor and normal, tumor and stroma, normal, and stroma, respectively. In the first column, you can see the original images, while the second column displays the activation maps for tumor regions, the third column displays the activation maps for stroma regions, and the fourth column reveals the activation maps for normal regions. Higher brightness in the activation maps indicates a higher probability of the corresponding region belonging to that class. From these images, it&#x00027;s evident that distinguishing the activation layer enables the generation of activation regions corresponding to each class. Remarkably, even without the explicit use of pixel-level annotations during training to inform the model about specific regions as the tumor, stroma, or normal, weakly supervised learning using only image-level labels demonstrates the ability to produce pixel-level activations, showcasing a crucial feature of CAM.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Category-specific activation maps corresponding to the tumor, stroma, and normal regions.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0007.tif"/>
</fig>
</sec>
<sec>
<title>4.5.3 CRF refinement of pseudo-labels</title>
<p><xref ref-type="fig" rid="F8">Figure 8</xref> demonstrates the refinement of pseudo-labels through CRF operations. The first column showcases the original images, the second column displays the pseudo-labels before CRF refinement, and the third column reveals the pseudo-labels after CRF refinement. Let&#x00027;s compare the state of the labels before and after CRF operations based on these results. From the examples in the first row, it&#x00027;s evident that the pseudo-labels before CRF refinement exhibit distinct boundaries between tumor and normal regions but overlook individual tumor cells present in the finer details. CRF operations, guided by the original image, rectify these boundaries, resulting in a more precise demarcation between tumor and normal regions. In the second row of examples, it becomes apparent that CRF not only refines details but also corrects more extensive areas of segmentation error. The third and fourth rows represent normal and stromal tissues, and a comparison with <xref ref-type="fig" rid="F4">Figures 4</xref>, <xref ref-type="fig" rid="F5">5</xref> reveals that activation maps can emphasize specific classes without clearly defined activation boundaries for the image&#x00027;s boundary details. Consequently, in the pseudo-labels of the second column, only the categories are nearly discernible. After undergoing CRF operations, the distinctions between foreground and background become much clearer.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Comparison of pseudo-labels before and after CRF operations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-15-1483052-g0008.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="conclusions" id="s5">
<title>5 Conclusion</title>
<p>This paper proposes a novel weakly-supervised segmentation method based on class region activation mapping, effectively achieving the segmentation of tissue regions in lung adenocarcinoma pathological images. The paper incorporates distinguishing activation layers and self-supervised layers into the classification network to predict activation maps corresponding to each category in the image and explore inter-image collaborative information. Subsequently, pseudo-labels generated from the activation maps are used as training labels for the target semantic segmentation module. The fusion of the pseudo-label prediction module and the target segmentation module allows for better utilization of pixel-level segmentation of target regions with image-level labels. Experimental results on the test set of a publicly available lung adenocarcinoma dataset validate the performance of the weakly-supervised segmentation algorithm based on category-specific activation. Compared to traditional weakly-supervised semantic segmentation methods based on category activation maps, this algorithm exhibits a significant improvement in segmentation accuracy in the literature.</p>
<p>The algorithm has only been validated on a lung adenocarcinoma dataset. Although the algorithm performs well on the lung adenocarcinoma dataset, its generalization ability to other diseases or types of tissue images has not been verified. Therefore, the method&#x00027;s performance on other image datasets may not be as expected. In the future, we will consider extending the algorithm to different pathological datasets and types of tissue images to validate its generalization capability. We also plan to integrate pathological images with other types of medical imaging (e.g., CT, MRI) for multimodal analysis to enhance diagnostic accuracy and the applicability of the model.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>NP: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Software, Validation, Visualization, Writing &#x02013; original draft. XM: Data curation, Formal analysis, Investigation, Project administration, Resources, Supervision, Validation, Writing &#x02013; original draft. HL: Conceptualization, Data curation, Formal analysis, Project administration, Writing &#x02013; original draft. XG: Conceptualization, Data curation, Funding acquisition, Resources, Writing &#x02013; original draft. XS: Conceptualization, Data curation, Formal analysis, Writing &#x02013; original draft. YJ: Conceptualization, Data curation, Formal analysis, Investigation, Project administration, Software, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Nature Science Foundation of China (62272283); the Natural Science Foundation of Shandong Province (ZR2021QF135); and the &#x0201C;Young Innovation Team Program&#x0201D; of Shandong Provincial University (2022KJ250).</p>
</sec>
<ack><p>The authors gratefully acknowledge the participants who volunteered to help with the present study.</p>
</ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/about-nci/organization/ccg/research/structural-genomics/tcga">https://www.cancer.gov/about-nci/organization/ccg/research/structural-genomics/tcga</ext-link></p></fn>
<fn id="fn0002"><p><sup>2</sup><ext-link ext-link-type="uri" xlink:href="https://wsss4luad.grand-challenge.org/">https://wsss4luad.grand-challenge.org/</ext-link></p></fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ahn</surname> <given-names>J.</given-names></name> <name><surname>Cho</surname> <given-names>S.</given-names></name> <name><surname>Kwak</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Weakly supervised learning of instance segmentation with inter-pixel relations,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source>, 2209&#x02013;2218. <pub-id pub-id-type="doi">10.1109/CVPR.2019.00231</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ahn</surname> <given-names>J.</given-names></name> <name><surname>Kwak</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Learning pixel-level semantic affinity with image-level supervision for weakly supervised semantic segmentation,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 4981&#x02013;4990. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00523</pub-id><pub-id pub-id-type="pmid">38478447</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Amgad</surname> <given-names>M.</given-names></name> <name><surname>Atteya</surname> <given-names>L. A.</given-names></name> <name><surname>Hussein</surname> <given-names>H.</given-names></name> <name><surname>Mohammed</surname> <given-names>K. H.</given-names></name> <name><surname>Hafiz</surname> <given-names>E.</given-names></name> <name><surname>Elsebaie</surname> <given-names>M. A.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Nucls: a scalable crowdsourcing approach and dataset for nucleus classification and segmentation in breast cancer</article-title>. <source>GigaScience</source> <volume>11</volume>:<fpage>giac037</fpage>. <pub-id pub-id-type="doi">10.1093/gigascience/giac037</pub-id><pub-id pub-id-type="pmid">35579553</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Arnab</surname> <given-names>A.</given-names></name> <name><surname>Torr</surname> <given-names>P. H.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Pixelwise instance segmentation with a dynamically instantiated network,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 441&#x02013;450. <pub-id pub-id-type="doi">10.1109/CVPR.2017.100</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bankhead</surname> <given-names>P.</given-names></name> <name><surname>Loughrey</surname> <given-names>M. B.</given-names></name> <name><surname>Fern&#x000E1;ndez</surname> <given-names>J. A.</given-names></name> <name><surname>Dombrowski</surname> <given-names>Y.</given-names></name> <name><surname>McArt</surname> <given-names>D. G.</given-names></name> <name><surname>Dunne</surname> <given-names>P. D.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Qupath: open source software for digital pathology image analysis</article-title>. <source>Sci. Rep</source>. <volume>7</volume>, <fpage>1</fpage>&#x02013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-17204-5</pub-id><pub-id pub-id-type="pmid">29203879</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cardenas</surname> <given-names>C. E.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Anderson</surname> <given-names>B. M.</given-names></name> <name><surname>Court</surname> <given-names>L. E.</given-names></name> <name><surname>Brock</surname> <given-names>K. B.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Advances in auto-segmentation,&#x0201D;</article-title> in <source>Seminars in Radiation Oncology</source> (<publisher-loc>Elsevier</publisher-loc>), <fpage>185</fpage>&#x02013;<lpage>197</lpage>. <pub-id pub-id-type="doi">10.1016/j.semradonc.2019.02.001</pub-id><pub-id pub-id-type="pmid">31027636</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chan</surname> <given-names>L.</given-names></name> <name><surname>Hosseini</surname> <given-names>M. S.</given-names></name> <name><surname>Rowsell</surname> <given-names>C.</given-names></name> <name><surname>Plataniotis</surname> <given-names>K. N.</given-names></name> <name><surname>Damaskinos</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Histosegnet: semantic segmentation of histological tissue type in whole slide images,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source>, 10662&#x02013;10671. <pub-id pub-id-type="doi">10.1109/ICCV.2019.01076</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Qi</surname> <given-names>X.</given-names></name> <name><surname>Yu</surname> <given-names>L.</given-names></name> <name><surname>Dou</surname> <given-names>Q.</given-names></name> <name><surname>Qin</surname> <given-names>J.</given-names></name> <name><surname>Heng</surname> <given-names>P.-A.</given-names></name></person-group> (<year>2017a</year>). <article-title>Dcan: deep contour-aware networks for object instance segmentation from histology images</article-title>. <source>Med. Image Anal</source>. <volume>36</volume>:<fpage>135</fpage>&#x02013;<lpage>146</lpage>. <pub-id pub-id-type="doi">10.1016/j.media.2016.11.004</pub-id><pub-id pub-id-type="pmid">27898306</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>L.-C.</given-names></name> <name><surname>Papandreou</surname> <given-names>G.</given-names></name> <name><surname>Schroff</surname> <given-names>F.</given-names></name> <name><surname>Adam</surname> <given-names>H.</given-names></name></person-group> (<year>2017b</year>). <article-title>Rethinking atrous convolution for semantic image segmentation</article-title>. <source>arXiv preprint arXiv:1706.05587</source>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Hua</surname> <given-names>X.-S.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Sun</surname> <given-names>Q.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Class re-activation maps for weakly-supervised semantic segmentation,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source>, 969&#x02013;978. <pub-id pub-id-type="doi">10.1109/CVPR52688.2022.00104</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Da</surname> <given-names>Q.</given-names></name> <name><surname>Huang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Zuo</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Digestpath: a benchmark dataset with challenge review for the pathological detection and segmentation of digestive-system</article-title>. <source>Med. Image Anal</source>. <volume>80</volume>:<fpage>102485</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102485</pub-id><pub-id pub-id-type="pmid">35679692</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ferlay</surname> <given-names>J.</given-names></name> <name><surname>Colombet</surname> <given-names>M.</given-names></name> <name><surname>Soerjomataram</surname> <given-names>I.</given-names></name> <name><surname>Parkin</surname> <given-names>D. M.</given-names></name> <name><surname>Pi&#x000F1;eros</surname> <given-names>M.</given-names></name> <name><surname>Znaor</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Cancer statistics for the year 2020: an overview</article-title>. <source>Int. J. Cancer</source> <volume>149</volume>, <fpage>778</fpage>&#x02013;<lpage>789</lpage>. <pub-id pub-id-type="doi">10.1002/ijc.33588</pub-id><pub-id pub-id-type="pmid">33818764</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>C.</given-names></name> <name><surname>Lin</surname> <given-names>J.</given-names></name> <name><surname>Mai</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Q.</given-names></name> <name><surname>Zhao</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2022a</year>). <article-title>Multi-layer pseudo-supervision for histopathology tissue semantic segmentation using patch-level classification labels</article-title>. <source>Med. Image Anal</source>. <volume>80</volume>:<fpage>102487</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102487</pub-id><pub-id pub-id-type="pmid">35671591</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>C.</given-names></name> <name><surname>Pan</surname> <given-names>X.</given-names></name> <name><surname>Yan</surname> <given-names>L.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>B.</given-names></name> <name><surname>Yao</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2022b</year>). <article-title>Wsss4luad: grand challenge on weakly-supervised tissue semantic segmentation for lung adenocarcinoma</article-title>. <source>arXiv preprint arXiv:2204.06455</source>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>K.</given-names></name> <name><surname>Li</surname> <given-names>T.</given-names></name> <name><surname>Du</surname> <given-names>C.</given-names></name> <name><surname>Xia</surname> <given-names>S.</given-names></name> <name><surname>Fu</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>H2former: an efficient hierarchical hybrid transformer for medical image segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source>. <volume>42</volume>, <fpage>2763</fpage>&#x02013;<lpage>2775</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2023.3264513</pub-id><pub-id pub-id-type="pmid">37018111</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Gkioxari</surname> <given-names>G.</given-names></name> <name><surname>Doll&#x000E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Mask R-CNN,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Conference on Computer Vision</source>, 2961&#x02013;2969. <pub-id pub-id-type="doi">10.1109/ICCV.2017.322</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Identity mappings in deep residual networks,&#x0201D;</article-title> in <source>European Conference on Computer Vision</source> (<publisher-loc>Springer</publisher-loc>), <fpage>630</fpage>&#x02013;<lpage>645</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-46493-0_38</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hesamian</surname> <given-names>M. H.</given-names></name> <name><surname>Jia</surname> <given-names>W.</given-names></name> <name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Kennedy</surname> <given-names>P.</given-names></name></person-group> (<year>2019</year>). <article-title>Deep learning techniques for medical image segmentation: achievements and challenges</article-title>. <source>J. Digit. Imaging</source> <volume>32</volume>:<fpage>582</fpage>&#x02013;<lpage>596</lpage>. <pub-id pub-id-type="doi">10.1007/s10278-019-00227-x</pub-id><pub-id pub-id-type="pmid">31144149</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hiary</surname> <given-names>H.</given-names></name> <name><surname>Alomari</surname> <given-names>R. S.</given-names></name> <name><surname>Saadah</surname> <given-names>M.</given-names></name> <name><surname>Chaudhary</surname> <given-names>V.</given-names></name></person-group> (<year>2013</year>). <article-title>Automated segmentation of stromal tissue in histology images using a voting bayesian model</article-title>. <source>Signal, Image Video Proc</source>. <volume>7</volume>, <fpage>1229</fpage>&#x02013;<lpage>1237</lpage>. <pub-id pub-id-type="doi">10.1007/s11760-012-0393-2</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Weakly-supervised semantic segmentation network with deep seeded region growing,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 7014&#x02013;7023. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00733</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Isensee</surname> <given-names>F.</given-names></name> <name><surname>Jaeger</surname> <given-names>P. F.</given-names></name> <name><surname>Kohl</surname> <given-names>S. A.</given-names></name> <name><surname>Petersen</surname> <given-names>J.</given-names></name> <name><surname>Maier-Hein</surname> <given-names>K. H.</given-names></name></person-group> (<year>2021</year>). <article-title>NNU-net: a self-configuring method for deep learning-based biomedical image segmentation</article-title>. <source>Nat. Methods</source> <volume>18</volume>, <fpage>203</fpage>&#x02013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-020-01008-z</pub-id><pub-id pub-id-type="pmid">33288961</pub-id></citation></ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jiang</surname> <given-names>P.-T.</given-names></name> <name><surname>Hou</surname> <given-names>Q.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Cheng</surname> <given-names>M.-M.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Xiong</surname> <given-names>H.-K.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Integral object mining via online attention accumulation,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source>, 2070&#x02013;2079. <pub-id pub-id-type="doi">10.1109/ICCV.2019.00216</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>C.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name></person-group> (<year>2021</year>). <article-title>Adversarial network integrating dual attention and sparse representation for semi-supervised semantic segmentation</article-title>. <source>Inf. Proc. Manag</source>. <volume>58</volume>:<fpage>102680</fpage>. <pub-id pub-id-type="doi">10.1016/j.ipm.2021.102680</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>Q.</given-names></name> <name><surname>Cui</surname> <given-names>H.</given-names></name> <name><surname>Sun</surname> <given-names>C.</given-names></name> <name><surname>Zheng</surname> <given-names>J.</given-names></name> <name><surname>Wei</surname> <given-names>L.</given-names></name> <name><surname>Fang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Semi-supervised histological image segmentation via hierarchical consistency enforcement,&#x0201D;</article-title> in <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source> (<publisher-loc>Springer</publisher-loc>), <fpage>3</fpage>&#x02013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-16434-7_1</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kolesnikov</surname> <given-names>A.</given-names></name> <name><surname>Lampert</surname> <given-names>C. H.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Seed, expand and constrain: three principles for weakly-supervised image segmentation,&#x0201D;</article-title> in <source>Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part IV 14</source> (<publisher-loc>Springer</publisher-loc>), <fpage>695</fpage>&#x02013;<lpage>711</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-46493-0_42</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kumar</surname> <given-names>N.</given-names></name> <name><surname>Gupta</surname> <given-names>R.</given-names></name> <name><surname>Gupta</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>Whole slide imaging (WSI) in pathology: current perspectives and future directions</article-title>. <source>J. Digit. Imaging</source> <volume>33</volume>, <fpage>1034</fpage>&#x02013;<lpage>1040</lpage>. <pub-id pub-id-type="doi">10.1007/s10278-020-00351-z</pub-id><pub-id pub-id-type="pmid">32468487</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kweon</surname> <given-names>H.</given-names></name> <name><surname>Yoon</surname> <given-names>S.-H.</given-names></name> <name><surname>Kim</surname> <given-names>H.</given-names></name> <name><surname>Park</surname> <given-names>D.</given-names></name> <name><surname>Yoon</surname> <given-names>K.-J.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Unlocking the potential of ordinary classifier: class-specific adversarial erasing framework for weakly supervised semantic segmentation,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source>, 6994&#x02013;7003. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00691</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Leube</surname> <given-names>J.</given-names></name> <name><surname>Horn</surname> <given-names>M.</given-names></name> <name><surname>Hartrampf</surname> <given-names>P. E.</given-names></name> <name><surname>Buck</surname> <given-names>A. K.</given-names></name> <name><surname>Lassmann</surname> <given-names>M.</given-names></name> <name><surname>Tran-Gia</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>PSMA-PET improves deep learning-based automated CT kidney segmentation</article-title>. <source>Zeitschrift Med. Physik</source>. <volume>34</volume>, <fpage>231</fpage>&#x02013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1016/j.zemedi.2023.08.006</pub-id><pub-id pub-id-type="pmid">37666698</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Yang</surname> <given-names>S.</given-names></name> <name><surname>Huang</surname> <given-names>X.</given-names></name> <name><surname>Da</surname> <given-names>Q.</given-names></name> <name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Hu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;Signet ring cell detection with a semi-supervised learning framework,&#x0201D;</article-title> in <source>Information Processing in Medical Imaging: 26th International Conference, IPMI 2019, Hong Kong, China, June 2-7, 2019, Proceedings 26</source> (<publisher-loc>Springer</publisher-loc>), <fpage>842</fpage>&#x02013;<lpage>854</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-20351-1_66</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Peng</surname> <given-names>K.-C.</given-names></name> <name><surname>Ernst</surname> <given-names>J.</given-names></name> <name><surname>Fu</surname> <given-names>Y.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Tell me where to look: Guided attention inference network,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 9215&#x02013;9223. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00960</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Qi</surname> <given-names>L.</given-names></name> <name><surname>Qin</surname> <given-names>H.</given-names></name> <name><surname>Shi</surname> <given-names>J.</given-names></name> <name><surname>Jia</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Path aggregation network for instance segmentation,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 8759&#x02013;8768. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00913</pub-id></citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Long</surname> <given-names>J.</given-names></name> <name><surname>Shelhamer</surname> <given-names>E.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Fully convolutional networks for semantic segmentation,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 3431&#x02013;3440. <pub-id pub-id-type="doi">10.1109/CVPR.2015.7298965</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>M. Y.</given-names></name> <name><surname>Williamson</surname> <given-names>D. F.</given-names></name> <name><surname>Chen</surname> <given-names>T. Y.</given-names></name> <name><surname>Chen</surname> <given-names>R. J.</given-names></name> <name><surname>Barbieri</surname> <given-names>M.</given-names></name> <name><surname>Mahmood</surname> <given-names>F.</given-names></name></person-group> (<year>2021</year>). <article-title>Data-efficient and weakly supervised computational pathology on whole-slide images</article-title>. <source>Nat. Biomed. Eng</source> <volume>5</volume>, <fpage>555</fpage>&#x02013;<lpage>570</lpage>. <pub-id pub-id-type="doi">10.1038/s41551-020-00682-w</pub-id><pub-id pub-id-type="pmid">33649564</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Milletari</surname> <given-names>F.</given-names></name> <name><surname>Navab</surname> <given-names>N.</given-names></name> <name><surname>Ahmadi</surname> <given-names>S.-A.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;V-net: fully convolutional neural networks for volumetric medical image segmentation,&#x0201D;</article-title> in <source>2016 Fourth International Conference on 3D Vision (3DV)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>565</fpage>&#x02013;<lpage>571</lpage>. <pub-id pub-id-type="doi">10.1109/3DV.2016.79</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oskal</surname> <given-names>K. R.</given-names></name> <name><surname>Risdal</surname> <given-names>M.</given-names></name> <name><surname>Janssen</surname> <given-names>E. A.</given-names></name> <name><surname>Undersrud</surname> <given-names>E. S.</given-names></name> <name><surname>Gulsrud</surname> <given-names>T. O.</given-names></name></person-group> (<year>2019</year>). <article-title>A u-net based approach to epidermal tissue segmentation in whole slide histopathological images</article-title>. <source>SN Appl. Sci</source>. <volume>1</volume>, <fpage>1</fpage>&#x02013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1007/s42452-019-0694-y</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Pan</surname> <given-names>W.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Human-machine interactive tissue prototype learning for label-efficient histopathology image segmentation,&#x0201D;</article-title> in <source>International Conference on Information Processing in Medical Imaging</source> (<publisher-loc>Springer</publisher-loc>), <fpage>679</fpage>&#x02013;<lpage>691</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-34048-2_52</pub-id></citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qin</surname> <given-names>J.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Xiao</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Activation modulation and recalibration scheme for weakly supervised semantic segmentation,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence</source>, 2117&#x02013;2125. <pub-id pub-id-type="doi">10.1609/aaai.v36i2.20108</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;U-net: convolutional networks for biomedical image segmentation,&#x0201D;</article-title> in <source>Medical Image Computing and Computer-Assisted Intervention-MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18</source> (<publisher-loc>Springer</publisher-loc>), <fpage>234</fpage>&#x02013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sung</surname> <given-names>H.</given-names></name> <name><surname>Ferlay</surname> <given-names>J.</given-names></name> <name><surname>Siegel</surname> <given-names>R. L.</given-names></name> <name><surname>Laversanne</surname> <given-names>M.</given-names></name> <name><surname>Soerjomataram</surname> <given-names>I.</given-names></name> <name><surname>Jemal</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Global cancer statistics 2020: globocan estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>. <source>CA Cancer J. Clin</source>. <volume>71</volume>, <fpage>209</fpage>&#x02013;<lpage>249</lpage>. <pub-id pub-id-type="doi">10.3322/caac.21660</pub-id><pub-id pub-id-type="pmid">33538338</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Verma</surname> <given-names>R.</given-names></name> <name><surname>Kumar</surname> <given-names>N.</given-names></name> <name><surname>Patil</surname> <given-names>A.</given-names></name> <name><surname>Kurian</surname> <given-names>N. C.</given-names></name> <name><surname>Rane</surname> <given-names>S.</given-names></name> <name><surname>Graham</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Monusac2020: a multi-organ nuclei segmentation and classification challenge</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>40</volume>, <fpage>3413</fpage>&#x02013;<lpage>3423</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2021.3085712</pub-id><pub-id pub-id-type="pmid">34086562</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Rong</surname> <given-names>R.</given-names></name> <name><surname>Yang</surname> <given-names>D. M.</given-names></name> <name><surname>Fujimoto</surname> <given-names>J.</given-names></name> <name><surname>Yan</surname> <given-names>S.</given-names></name> <name><surname>Cai</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Computational staining of pathology images to study the tumor microenvironment in lung cancer</article-title>. <source>Cancer Res</source>. <volume>80</volume>, <fpage>2056</fpage>&#x02013;<lpage>2066</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.CAN-19-1629</pub-id><pub-id pub-id-type="pmid">31915129</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>T.</given-names></name> <name><surname>Huang</surname> <given-names>J.</given-names></name> <name><surname>Gao</surname> <given-names>G.</given-names></name> <name><surname>Wei</surname> <given-names>X.</given-names></name> <name><surname>Wei</surname> <given-names>X.</given-names></name> <name><surname>Luo</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Embedded discriminative attention mechanism for weakly supervised semantic segmentation,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source>, 16765&#x02013;16774. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01649</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xun</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Zhu</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Generative adversarial networks in medical image segmentation: a review</article-title>. <source>Comput. Biol. Med</source>. <volume>140</volume>:<fpage>105063</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.105063</pub-id><pub-id pub-id-type="pmid">34864584</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yan</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Yao</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Deep contrastive learning based tissue clustering for annotation-free histopathology image analysis</article-title>. <source>Computerized Medical Imaging and Graphics</source>. <volume>97</volume>:<fpage>102053</fpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2022.102053</pub-id><pub-id pub-id-type="pmid">35306442</pub-id></citation></ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>B.</given-names></name> <name><surname>Xiao</surname> <given-names>J.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Sun</surname> <given-names>M.</given-names></name> <name><surname>Huang</surname> <given-names>K.</given-names></name></person-group> (<year>2020a</year>). <article-title>&#x0201C;Reliability does matter: an end-to-end weakly supervised semantic segmentation approach,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence</source>, 12765&#x02013;12772. <pub-id pub-id-type="doi">10.1609/aaai.v34i07.6971</pub-id></citation>
</ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Tang</surname> <given-names>J.</given-names></name> <name><surname>Hua</surname> <given-names>X.-S.</given-names></name> <name><surname>Sun</surname> <given-names>Q.</given-names></name></person-group> (<year>2020b</year>). <article-title>&#x0201C;Causal intervention for weakly-supervised semantic segmentation,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <fpage>655</fpage>&#x02013;<lpage>666</lpage>.</citation>
</ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Feng</surname> <given-names>J.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>T. S.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Adversarial complementary learning for weakly supervised object localization,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 1325&#x02013;1334. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00144</pub-id></citation>
</ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>B.</given-names></name> <name><surname>Khosla</surname> <given-names>A.</given-names></name> <name><surname>Lapedriza</surname> <given-names>A.</given-names></name> <name><surname>Oliva</surname> <given-names>A.</given-names></name> <name><surname>Torralba</surname> <given-names>A.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Learning deep features for discriminative localization,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>, 2921&#x02013;2929. <pub-id pub-id-type="doi">10.1109/CVPR.2016.319</pub-id></citation>
</ref>
</ref-list>
</back>
</article>