<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2024.1389396</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Oncology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Neighborhood attention transformer multiple instance learning for whole slide image classification</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Aftab</surname>
<given-names>Rukhma</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2252013"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yan</surname>
<given-names>Qiang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1051950"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Juanjuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1387519"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yong</surname>
<given-names>Gao</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Huajie</surname>
<given-names>Yue</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Urrehman</surname>
<given-names>Zia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mohammad Khalid</surname>
<given-names>Faizi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Computer Science and Technology (College of Data Science), Taiyuan University of Technology</institution>, <addr-line>Taiyuan, Shanxi</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Software, North University of China</institution>, <addr-line>Taiyuan, Shanxi</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Respiratory and Critical Care Medicine, Sinopharm Tongmei General Hospital</institution>, <addr-line>Datong, Shanxi</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>First Hospital of Shanxi Medical University, Shanxi Medical University</institution>, <addr-line>Taiyuan, Shanxi</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Alla Reznik, Lakehead University, Canada</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Ashfaq Niaz, Taiyuan University of Technology, China</p>
<p>Fahim Niaz, Wuhan University, China</p>
<p>Muhammad Usman Shoukat, Jilin University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Rukhma Aftab, <email xlink:href="mailto:Rukhma_khan14@yahoo.com">Rukhma_khan14@yahoo.com</email>; Qiang Yan, <email xlink:href="mailto:qiangyan@tyut.edu.cn">qiangyan@tyut.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>08</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>14</volume>
<elocation-id>1389396</elocation-id>
<history>
<date date-type="received">
<day>21</day>
<month>02</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>06</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Aftab, Yan, Zhao, Yong, Huajie, Urrehman and Mohammad Khalid</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Aftab, Yan, Zhao, Yong, Huajie, Urrehman and Mohammad Khalid</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Pathologists rely on whole slide images (WSIs) to diagnose cancer by identifying tumor cells and subtypes. Deep learning models, particularly weakly supervised ones, classify WSIs using image tiles but may overlook false positives and negatives due to the heterogeneous nature of tumors. Both cancerous and healthy cells can proliferate in patterns that extend beyond individual tiles, leading to errors at the tile level that result in inaccurate tumor-level classifications.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address this limitation, we introduce NATMIL (Neighborhood Attention Transformer Multiple Instance Learning), which utilizes the Neighborhood Attention Transformer to incorporate contextual dependencies among WSI tiles. NATMIL enhances multiple instance learning by integrating a broader tissue context into the model. Our approach enhances the accuracy of tumor classification by considering the broader tissue context, thus reducing errors associated with isolated tile analysis.</p>
</sec>
<sec>
<title>Results</title>
<p>We conducted a quantitative analysis to evaluate NATMIL&#x2019;s performance against other weakly supervised algorithms. When applied to subtyping non-small cell lung cancer (NSCLC) and lymph node (LN) tumors, NATMIL demonstrated superior accuracy. Specifically, NATMIL achieved accuracy values of 89.6% on the Camelyon dataset and 88.1% on the TCGA-LUSC dataset, outperforming existing methods. These results underscore NATMIL&#x2019;s potential as a robust tool for improving the precision of cancer diagnosis using WSIs.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Our findings demonstrate that NATMIL significantly improves tumor classification accuracy by reducing errors associated with isolated tile analysis. The integration of contextual dependencies enhances the precision of cancer diagnosis using WSIs, highlighting NATMIL&#x2019;s potential as a robust tool in pathology.</p>
</sec>
</abstract>
<kwd-group>
<kwd>attention transformer</kwd>
<kwd>whole slide images</kwd>
<kwd>multiple instance learning</kwd>
<kwd>lung cancer</kwd>
<kwd>weakly supervised learning</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="4"/>
<equation-count count="7"/>
<ref-count count="40"/>
<page-count count="10"/>
<word-count count="5110"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Cancer Imaging and Image-directed Interventions</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The examination of tissue biopsy sections, specifically whole slide images (WSIs), yields a substantial amount of phenotypic data and serves as the fundamental basis for the field of cancer pathology (<xref ref-type="bibr" rid="B1">1</xref>). Recently, there has been significant advancement in the field of deep learning (DL) techniques (<xref ref-type="bibr" rid="B2">2</xref>). These methods have revolutionized the construction of diagnostic machines that exhibit a high level of accuracy. In fact, their performance in tasks related to cancer classification and diagnosis has been seen to be on par with, or even surpass, that of specialists who have undergone extensive training (<xref ref-type="bibr" rid="B3">3</xref>). However, to create effective deep neural network (DNN) models for cancer pathology, it has often been necessary to annotate every WSI on a pixel level using thorough ground-truth descriptions based on expert opinions (<xref ref-type="bibr" rid="B4">4</xref>). The utilization of slide-level labels in a weakly supervised scenario for training DNN classification models has exhibited remarkable accuracy in classifying test data. This achievement has significant implications for the implementation of adaptable mathematical systems for decision-making in clinical practice, as evidenced by previous studies (<xref ref-type="bibr" rid="B5">5</xref>&#x2013;<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>In the context of cancer histology, DNN models do not process WSIs as single images at a time like regular images. Instead, WSIs are commonly broken into smaller units known as &#x201c;tiles&#x201d; that serve as input elements. Using tile-level DL characteristics, the entire WSI and tumor are classified. The Multiple Instance Learning (MIL) framework is used in most weakly supervised WSI classification algorithms to learn the slide-level label from each WSI as a &#x201c;bag&#x201d; of tiles. MIL models are permutation invariant, meaning WSI tiles have no specific ordering, which hinders their deployment and the weakly supervised learning paradigm (<xref ref-type="bibr" rid="B8">8</xref>).</p>
<p>The motivation behind this work is to address the limitations of current weakly supervised methods, which often overlook the spatial dependencies among WSI tiles. This oversight can lead to false positives and negatives, particularly given the heterogeneous nature of tumors. To overcome this challenge, we propose a novel and efficient hierarchical transformer model called Neighborhood Attention Transformer Multiple Instance Learning (NATMIL).</p>
<p>The novelty of our approach lies in the Neighborhood Attention mechanism, which localizes the Self-Attention operation to the nearest neighbors of each pixel, without relying on a predetermined window adjacent to the pixel. This updated definition permits all pixels to possess a uniform rate of attention, which would otherwise be diminished for edge pixels in zero-padded options. As the size of the neighborhood increases, neighborhood attention exhibits similarities to self-attention and can be considered equivalent to self-attention when the neighborhood reaches its maximum size. Moreover, the utilization of local attention offers the additional benefit of preserving translational equivariance, which sets it apart from blocked and window self-attention mechanisms.</p>
<p>We have devised a method called the Neighborhood Attention Transformer (NAT) that performs competitively. In summary, our most significant contributions are as follows:</p>
<list list-type="bullet">
<list-item>
<p>Proposing a simple and adaptable sliding window attention mechanism that preserves translational equivariance, approximates self-attention as its span increases, and localizes each pixel&#x2019;s attention span to its closest neighborhood. We contrast Neighborhood Attention with window self-attention, convolutions, and self-attention in terms of accuracy.</p>
</list-item>
<list-item>
<p>Introducing a new hierarchical transformer that leverages Neighborhood Attention (NA)&#x2019;s efficiency, accuracy, and scalability: the Neighborhood Attention Transformer (NAT). We demonstrate its effectiveness on downstream classification tasks.</p>
</list-item>
</list>
<p>By addressing the spatial dependencies among WSI tiles and introducing a novel attention mechanism, this work aims to significantly improve the accuracy and reliability of cancer pathology models.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>In the conventional approach, a WSI is commonly partitioned into non-overlapping tiles of a predetermined size. These tiles are subsequently assigned a weak label, determined based on the diagnosis at the slide level, to be utilized as input for a Deep Neural Network (DNN) (<xref ref-type="bibr" rid="B9">9</xref>). The MIL formulation allows for the prediction of a WSI label (cancer yes/no, cancer type) to originate either from the tile predictions (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B10">10</xref>&#x2013;<xref ref-type="bibr" rid="B12">12</xref>) or from a higher-level bag representation arising from the aggregation of the tile features (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B13">13</xref>&#x2013;<xref ref-type="bibr" rid="B15">15</xref>). The former method is referred to as instance based. The latter method, which makes use of bag embeddings (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B14">14</xref>), has been shown to perform better in experiments. Most contemporary bag-embedding-based methods (<xref ref-type="bibr" rid="B16">16</xref>) include attention mechanisms, which assign each tile a score reflecting its relative contribution to the overall WSI-level representation. Attention scores facilitate the automated identification of sub-regions that possess significant diagnostic value and provide information for the label at the WSI level (<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B17">17</xref>, <xref ref-type="bibr" rid="B18">18</xref>).</p>
<p>Different attention-based MIL models investigate WSI tissue structure in various ways. Many of them assume that the tiles are unrelated and randomly distributed, which is why they are permutation invariant. Based on this premise, a recent study (<xref ref-type="bibr" rid="B13">13</xref>) suggested an attention-based MIL pooling operator that can be taught to automatically compute the bag embedding as the weighted average of all tile features in the WSI. The adoption and modification of this operator have been extensive, with the inclusion of a clustering layer (<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B20">20</xref>) to enhance the acquisition of semantically rich and distinct class-specific features. Nevertheless, operators that are permutation invariant lack the intrinsic ability to capture the structural dependencies that exist between various tiles in the input. For example, the DSMIL method (<xref ref-type="bibr" rid="B21">21</xref>) employs a non-local operator to calculate an attention score for each tile. This value is determined by comparing the feature representation of the tile with that of a crucial tile. Recently, transformer-based designs have been introduced to examine the correlations among the various tiles of a WSI. These architectures typically employ a learnable position-dependent signal to effectively integrate the spatial information of the image (<xref ref-type="bibr" rid="B22">22</xref>, <xref ref-type="bibr" rid="B23">23</xref>). To optimize for the classification challenge and generate attention scores while concurrently learning the positional embeddings, TransMIL (<xref ref-type="bibr" rid="B24">24</xref>) uses a transformer-like architecture. However, transformer-based methodologies might overlook the fundamental biological processes that regulate the spatial organization of the slide.</p>
<p>The Stand-Alone Self-Attention (SASA) (<xref ref-type="bibr" rid="B25">25</xref>) technique is considered one of the initial sliding window self-attention patterns. Its primary objective is to substitute convolutions in current convolutional neural networks (CNNs) (<xref ref-type="bibr" rid="B26">26</xref>). Striding the feature map extracts key-value pairs like a convolution with zero padding. While accuracy improved, the implementation had high latency despite lower theoretical cost. Sliding window attention, first used in Longformer (<xref ref-type="bibr" rid="B27">27</xref>) for language processing, was later used in Vision Longformer (ViL) (<xref ref-type="bibr" rid="B28">28</xref>). Although Longformer and ViL&#x2019;s implementations differed from SASA, they were unable to grow to larger windows and models due to computational overhead. Liu et&#xa0;al. presented Window and Shifted Window (Swin) Attention (<xref ref-type="bibr" rid="B29">29</xref>), non-sliding window-based self-attention mechanisms (<xref ref-type="bibr" rid="B30">30</xref>) that split feature maps and apply self-attention to each partition individually. Swin Transformer is a pioneering hierarchical vision transformer. The feature maps are pyramid shaped, reducing spatial dimensionality and boosting depth. Swin&#x2019;s structure is widely employed in CNNs, making it compatible with other networks for downstream tasks like detection and segmentation. At ImageNet-1K classification, Swin outscored DeiT, which utilizes a convolutional teacher. Swin Transformer is the leading approach for object detection on MS-COCO and semantic segmentation on ADE20K. To address the slowness of SASA, Vaswani et&#xa0;al. (<xref ref-type="bibr" rid="B31">31</xref>) introduced HaloNet, which employs a new blocked attention pattern. While this modification does violate translational equivariance, the benefits in terms of both performance and memory are acknowledged. 
Three phases make up HaloNet&#x2019;s attention mechanism: blocking, haloing, and attention. Blocking input feature maps into non-overlapping subsets creates queries. Next, &#x201c;haloed&#x201d; nearby blocks are extracted as keys and values. Attention is then given to extracted queries and key-value pairs. A novel CNN architecture, ConvNeXt, was proposed by Liu et&#xa0;al. (<xref ref-type="bibr" rid="B32">32</xref>), inspired by models like Swin. The aforementioned models do not incorporate attention mechanisms; nevertheless, they demonstrate superior performance compared to Swin in several visual tasks.</p>
<p>Our Neighborhood Attention approach localizes the field of response to a window surrounding each query, eliminating the need for additional strategies like Swin&#x2019;s cyclic shift. We present Neighborhood Attention Transformer, a hierarchy-based transformer-like model using this attention mechanism, and compare its performance to Swin on image classification, object detection, and semantic segmentation.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<p>The NATMIL approach is founded on the premise that the surrounding neighborhood of a tile contains important information on the level of attention allocated to that specific tile by the model. By establishing a parallel between our framework and the process of analyzing a biopsy slide by a pathologist, one might conceptualize the act of zooming in and out of a particular sub-region as a means to comprehensively explore its broader surroundings, thereby enhancing our understanding of the adjacent micro-environment and tissue.</p>
<p>In NATMIL, the attention score of each tile is recalibrated by combining the attention scores of its surrounding tiles. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> provides an overview of the model. It may be broken down into four parts:</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>An overview of NATMIL model architecture. At first, preprocessing WSIs separates tissue from background. After splitting the WSIs into 256 &#xd7; 256 tiles, a pre-trained feature extractor generates 1,024 feature representations for each tile. Tile feature representations function as input for our Neighborhood Attention Transformer module. This module analyzes each patch and its neighbors, creating neighborhood descriptors and calculating attention coefficients. The output layer combines tile-level attention scores from the previous layer to get a slide categorization score.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1389396-g001.tif"/>
</fig>
<p>1. Each WSI undergoes a preprocessing step in which the tissue area is automatically segmented and divided into several smaller patches.</p>
<p>2. The patch and feature extraction module is composed of a series of convolutional, max pooling, and linear layers. Its purpose is to convert the initial tile input into low-dimensional feature representations. Let <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where each <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Here, <inline-formula>
<mml:math display="inline" id="im3">
<mml:mi>d</mml:mi>
</mml:math>
</inline-formula> represents the embedding dimension of a tile, <italic>n</italic> represents the number of tiles inside a WSI, and <italic>N</italic> represents the total number of WSIs.</p>
<p>3. An attention vector of dimension <italic>N</italic> &#xd7; 1 is produced by a Neighborhood Attention mechanism with a contrastive learning block that localizes self-attention to the nearest neighboring pixels.</p>
<p>4. A feature aggregator and classification layer combines the tile-level attention scores produced by the preceding layer into a slide-level prediction.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Feature extractor</title>
<p>To estimate attention weights across instances that exhibit identical feature representations, we present the use of self-supervised contrastive learning. In this study, we focus on SimCLR (<xref ref-type="bibr" rid="B33">33</xref>), a widely recognized self-supervised learning system. As shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>, SimCLR facilitates the acquisition of semantically meaningful feature representations by decreasing the dissimilarity between multiple augmented versions of the same image.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>SimCLR training and inference. Two augmentations are applied to a tile during training. The two augmented views of the same tile are supplied to a ResNet-50 pre-trained on ImageNet with an additional projection head. Training ResNet-50&#x2019;s final convolutional block and the projection head involves minimizing the contrastive loss across tiles. Features are retrieved from the refined ResNet-50 during inference. In the neighbor attention transformer module, patch distances are determined.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1389396-g002.tif"/>
</fig>
<p>After partitioning the segmented tissue region into tiles, we employ two distinctively enhanced variations of the identical tile as an input to an instance-level feature encoder denoted as <italic>F</italic>(<italic>x</italic>), which is built using a ResNet-50 architecture.</p>
<p>In the NATMIL framework, the last step involves the utilization of a projection head. This projection head is implemented as a multi-layer perceptron (MLP) containing two hidden layers. Its purpose is to transform the feature representations into a distinct space where a contrastive loss function is subsequently applied. During the training process, the feature representations <italic>z<sub>t</sub></italic> and <italic>z<sub>s</sub></italic>, which correspond to two differently augmented, correlated views of the same tile, are used to minimize the normalized temperature-scaled cross-entropy loss specified by <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>.</p>
<disp-formula id="eq1">
<label>1</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The function <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes cosine similarity, <italic>&#x3c4;</italic> denotes the temperature parameter, and <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is an indicator function that evaluates to 1 only if <italic>k</italic> = <italic>i</italic>.</p>
<p>
<inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> of each WSI is generated using the ResNet-50 network as the base encoder, where <italic>n</italic> is the number of tiles and <italic>d</italic> is the embedding dimension.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Neighborhood Attention Transformer module</title>
<p>To encode the feature embeddings of the individual tiles, we utilize a transformer layer, <italic>T</italic>, to aggregate the feature embeddings <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <italic>d</italic> is the embedding dimension of a tile, <italic>n</italic> is the number of tiles inside a WSI, and <italic>N</italic> is the number of WSIs.</p>
<p>In this study, we propose the incorporation of a novel mechanism known as Neighborhood Attention (NA). We define attention weights for the <italic>i</italic>-th input with neighborhood size <inline-formula>
<mml:math display="inline" id="im9">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref> as the dot product of the <italic>i</italic>-th input&#x2019;s query projection and its <inline-formula>
<mml:math display="inline" id="im11">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> nearest neighboring key projections. This definition assumes an input <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which is a matrix whose rows are <italic>d</italic>-dimensional token vectors, and <italic>X</italic>&#x2019;s linear projections, <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im14">
<mml:mi>V</mml:mi>
</mml:math>
</inline-formula>, and relative positional biases <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
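</inline-formula>, the attention weights are computed as follows, where <italic>&#x3c1;</italic><sub><italic>j</italic></sub>(<italic>i</italic>) denotes the <italic>j</italic>-th nearest neighbor of the <italic>i</italic>-th input:</p>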
<disp-formula id="eq2">
<label>2</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="true">[</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x22c5;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x22c5;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x22c5;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
<mml:mo stretchy="true">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Next, in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref> we define <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, the adjacent values, as a matrix whose rows are the <inline-formula>
<mml:math display="inline" id="im17">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> nearest neighboring value projections of the <italic>i</italic>-th input:</p>
<disp-formula id="eq3">
<label>3</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Next, we define attention for the <italic>i</italic>-th token with neighborhood size <inline-formula>
<mml:math display="inline" id="im18">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> as follows:</p>
<disp-formula id="eq4">
<label>4</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mi>d</mml:mi>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>with the scaling parameter denoted by <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msqrt>
<mml:mi>d</mml:mi>
</mml:msqrt>
</mml:mrow>
</mml:math>
</inline-formula> as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>. For each pixel in the feature map, this process is repeated.</p>
<p>With two consecutive 3 &#xd7; 3 convolutions with 2 &#xd7; 2 strides, NAT embeds inputs into a spatial size that is one-fourth that of the input, as shown in <xref ref-type="fig" rid="f3"><bold>Figure 3</bold></xref>. This approach resembles a patch and embedding layer that consists of 4 &#xd7; 4 patches, but it employs overlapping convolutions instead of non-overlapping ones, thereby introducing valuable inductive biases. Using two overlapping convolutions does increase the computational cost and the number of parameters; nevertheless, we address this issue by reconfiguring the model, achieving an improved trade-off. With the exception of the last level, each of the four NAT levels is followed by a downsampler. Downsamplers double the number of channels while halving the spatial size. Instead of the 2 &#xd7; 2 non-overlapping convolutions that Swin employs (patch merging), we employ 3 &#xd7; 3 convolutions with 2 &#xd7; 2 strides. As a result of the tokenizer&#x2019;s fourfold downsampling, our model generates feature maps with sizes of <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:mfrac>
<mml:mi>h</mml:mi>
<mml:mn>4</mml:mn>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mn>4</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>h</mml:mi>
<mml:mn>8</mml:mn>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mn>8</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>. The motivation for this shift stems from the success of previous CNN structures, which has since led to the development of various hierarchical attention-based approaches, such as PVT (<xref ref-type="bibr" rid="B34">34</xref>), ViL (<xref ref-type="bibr" rid="B28">28</xref>), and Swin Transformer (<xref ref-type="bibr" rid="B29">29</xref>). Furthermore, Layer-Scale [29] is employed to provide stability in larger model variants. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> presents a visual representation of the entire network structure.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>An overview of NAT, with its hierarchical design. The model begins with a convolutional downsampler and progresses through four successive stages containing numerous NAT Blocks, which are transformer-like encoder layers. The layers consist of a multi-headed neighborhood attention (NA), multi-layered perceptron (MLP), Layer Norm (LN) before each module, and skip connections. Between stages, feature maps are downsampled to half their spatial size and twice in depth.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1389396-g003.tif"/>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Feature aggregation</title>
<p>Aggregate WSI representation <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is adaptively calculated in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref> as a weighted average of individual value vectors, each weighted by its attention score defined in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<disp-formula id="eq5">
<label>5</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>such that</p>
<disp-formula id="eq6">
<label>6</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The learnable parameters in this context are denoted as <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>U</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im23">
<mml:mi>w</mml:mi>
</mml:math>
</inline-formula>. The symbol <inline-formula>
<mml:math display="inline" id="im24">
<mml:mo>&#x2299;</mml:mo>
</mml:math>
</inline-formula> represents element-wise multiplication. The function <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> refers to the sigmoid non-linearity, whereas <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the hyperbolic tangent function.</p>
<p>Finally, the classifier layer, with weights <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, assigns each slide a score:</p>
<disp-formula id="eq7">
<label>7</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:msup>
<mml:mi>g</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>c</italic> is the total number of classes, as shown in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>. Finally, the model is trained by minimizing a cross-entropy loss on the classification score generated from the representation learned from the well-attended patches.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets</title>
<p>We conducted several tests using the Camelyon and TCGA-NSCLC datasets, both of which are widely utilized and publicly available. The Camelyon dataset stands out as a particularly significant open resource for studying breast cancer.</p>
<p>Among the largest public breast cancer datasets is Camelyon16 (<xref ref-type="bibr" rid="B35">35</xref>). It comprises a training set of 270 annotated biopsy slides and an official test set of 129 slides from Radboud University Medical Center and University Medical Center Utrecht in the Netherlands.</p>
<p>The TCGA-NSCLC dataset encompasses two distinct subtypes of non-small-cell lung cancer: lung squamous cell carcinoma (TCGA-LUSC) and lung adenocarcinoma (TCGA-LUAD). For LUAD, a total of 541 slides from 478 patients were obtained, while for LUSC, 512 slides from the same 478 cases were collected.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Baseline model</title>
<p>We evaluated the performance of our neighborhood pooling technique through a comparative analysis with classic pooling operators like Mean-pooling and Max-pooling, and various state-of-the-art Multiple Instance Learning (MIL) (<xref ref-type="bibr" rid="B36">36</xref>) methods. These methods include AB-MIL (<xref ref-type="bibr" rid="B37">37</xref>), CLAM-SB, CLAM-MB (<xref ref-type="bibr" rid="B15">15</xref>), MI Net, MIL-RNN (<xref ref-type="bibr" rid="B11">11</xref>), TransMIL (<xref ref-type="bibr" rid="B24">24</xref>), and DTFT-MIL (<xref ref-type="bibr" rid="B38">38</xref>).</p>
<p>The AB-MIL model incorporates attention mechanisms based on the specific attributes of each individual tile. In contrast, the CLAM-SB and CLAM-MB models also utilize attention pooling operators similar to AB-MIL but are further enhanced by an auxiliary clustering layer. MI Net employs both max pooling and mean pooling techniques to generate the WSI-level embedding. On the other hand, the MIL-RNN model is an aggregation model that utilizes a recurrent neural network. TransMIL utilizes a transformer-based aggregator, while DTFT-MIL employs the class activation map to calculate the positive probability of an instance within the AB-MIL framework.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Implementation</title>
<p>The tissue area was extracted from each slide using the publicly accessible WSI-preprocessing toolkit (<xref ref-type="bibr" rid="B15">15</xref>). Subsequently, this region was divided into non-overlapping patches of size 256 &#xd7; 256 at a magnification of &#xd7;20. It is important to note that variations in parameters during the feature extraction process may result in different training and test sets, potentially leading to varied model performance outcomes. Disseminating the extracted features allows other researchers to utilize the same dataset for training and evaluating their models, facilitating the comparison of different methodologies.</p>
<p>In our pipeline, the Neighborhood Attention Transformer component incorporated Swin&#x2019;s (<xref ref-type="bibr" rid="B29">29</xref>) training configuration module, enabling the implementation of learning rate, iteration-wise cosine schedule, and other hyperparameters. The results are presented below.</p>
</sec>
</sec>
<sec id="s5" sec-type="results">
<label>5</label>
<title>Results</title>
<p>The outcomes of employing the NATMIL methodology for the classification of WSIs in the Camelyon16 and TCGA-NSCLC datasets are displayed in <xref ref-type="table" rid="T1">
<bold>Tables&#xa0;1</bold>
</xref>, <xref ref-type="table" rid="T2">
<bold>2</bold>
</xref>. All tests in this study evaluate the performance using three metrics: the area under the receiver operating characteristic curve (AUC), the slide-level accuracy (ACC) with a threshold of 0.5, and the macro-averaged F1 score. These processes facilitated an acceptable evaluation across multiple techniques and datasets of varying sizes (<xref ref-type="bibr" rid="B39">39</xref>).</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Performance comparison of NATMIL against various baselines on the Camelyon16 dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">ACC</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">ABMIL-GATED</td>
<td valign="top" align="center">0.871 &#xb1; 0.025</td>
<td valign="top" align="center">0.842 &#xb1; 0.017</td>
<td valign="top" align="center">0.910 &#xb1; 0.027</td>
</tr>
<tr>
<td valign="top" align="center">MIL-RNN</td>
<td valign="top" align="center">0.872 &#xb1; 0.014</td>
<td valign="top" align="center">0.852 &#xb1; 0.016</td>
<td valign="top" align="center">0.921 &#xb1; 0.027</td>
</tr>
<tr>
<td valign="top" align="center">CLAM-SB</td>
<td valign="top" align="center">0.879 &#xb1; 0.023</td>
<td valign="top" align="center">0.862 &#xb1; 0.020</td>
<td valign="top" align="center">0.926 &#xb1; 0.021</td>
</tr>
<tr>
<td valign="top" align="center">CLAM-MB</td>
<td valign="top" align="center">0.882 &#xb1; 0.026</td>
<td valign="top" align="center">0.868 &#xb1; 0.031</td>
<td valign="top" align="center">0.927 &#xb1; 0.011</td>
</tr>
<tr>
<td valign="top" align="center">TRANSMIL</td>
<td valign="top" align="center">0.884 &#xb1; 0.013</td>
<td valign="top" align="center">0.869 &#xb1; 0.021</td>
<td valign="top" align="center">0.930 &#xb1; 0.013</td>
</tr>
<tr>
<td valign="top" align="center">DTFT-MIL</td>
<td valign="top" align="center">0.885 &#xb1; 0.013</td>
<td valign="top" align="center">0.871 &#xb1; 0.031</td>
<td valign="top" align="center">0.933 &#xb1; 0.021</td>
</tr>
<tr>
<td valign="top" align="center">
<bold>NATMIL</bold>
</td>
<td valign="top" align="center">0.896 &#xb1; 0.013</td>
<td valign="top" align="center">0.872 &#xb1; 0.015</td>
<td valign="top" align="center">0.940 &#xb1; 0.027</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Performance comparison of NATMIL against various baselines on the TCGA-NSCLC dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">ACC</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">ABMIL-GATED</td>
<td valign="top" align="center">0.859 &#xb1; 0.013</td>
<td valign="top" align="center">0.852 &#xb1; 0.017</td>
<td valign="top" align="center">0.880 &#xb1; 0.057</td>
</tr>
<tr>
<td valign="top" align="center">MIL-RNN</td>
<td valign="top" align="center">0.864 &#xb1; 0.023</td>
<td valign="top" align="center">0.862 &#xb1; 0.031</td>
<td valign="top" align="center">0.890 &#xb1; 0.038</td>
</tr>
<tr>
<td valign="top" align="center">CLAM-SB</td>
<td valign="top" align="center">0.839 &#xb1; 0.011</td>
<td valign="top" align="center">0.862 &#xb1; 0.023</td>
<td valign="top" align="center">0.897 &#xb1; 0.026</td>
</tr>
<tr>
<td valign="top" align="center">CLAM-MB</td>
<td valign="top" align="center">0.847 &#xb1; 0.009</td>
<td valign="top" align="center">0.866 &#xb1; 0.061</td>
<td valign="top" align="center">0.932 &#xb1; 0.027</td>
</tr>
<tr>
<td valign="top" align="center">TRANSMIL</td>
<td valign="top" align="center">0.865 &#xb1; 0.020</td>
<td valign="top" align="center">0.872 &#xb1; 0.061</td>
<td valign="top" align="center">0.940 &#xb1; 0.027</td>
</tr>
<tr>
<td valign="top" align="center">DTFT-MIL</td>
<td valign="top" align="center">0.879 &#xb1; 0.022</td>
<td valign="top" align="center">0.862 &#xb1; 0.054</td>
<td valign="top" align="center">0.920 &#xb1; 0.027</td>
</tr>
<tr>
<td valign="top" align="center">
<bold>NATMIL</bold>
</td>
<td valign="top" align="center">0.881 &#xb1; 0.0303</td>
<td valign="top" align="center">0.882 &#xb1; 0.017</td>
<td valign="top" align="center">0.940 &#xb1; 0.027</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The results presented in the tables are further elucidated in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, which illustrates the relationship between the hyperparameter &#x201c;<inline-formula>
<mml:math display="inline" id="im28">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>&#x201d; and the corresponding area under the receiver operating characteristic curve (AUC) values for the Camelyon16 and TCGA-NSCLC histopathology datasets.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The link between the hyperparameter &#x201c;k&#x201d; and the corresponding area under the receiver operating characteristic curve (AUC) values for the Camelyon16 and TCGA-NSCLC histopathology datasets.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1389396-g004.tif"/>
</fig>
<p>The figure demonstrates the impact of varying the neighborhood size &#x201c;<inline-formula>
<mml:math display="inline" id="im29">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>&#x201d; on the performance of the NATMIL model. For lower values of &#x201c;<inline-formula>
<mml:math display="inline" id="im30">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>&#x201d; (i.e., <inline-formula>
<mml:math display="inline" id="im31">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mo>{</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>4</mml:mn>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>), the model exhibits similar behavior across both datasets, performing consistently well under identical experimental conditions. This consistency is expected, as nearby tiles convey significant information regarding the risk of a tile being malignant. However, as the value of &#x201c;k&#x201d; increases, there is a progressive decline in the model&#x2019;s performance, except for a notable improvement when &#x201c;<inline-formula>
<mml:math display="inline" id="im32">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>&#x201d; equals 8.</p>
<p>This observed phenomenon can be attributed to recurring patterns within tumors, occurring at intervals of approximately eight tiles. Thus, the significance of employing models capable of capturing both local adjacent information and overall trends in the biopsy is underscored. It is also noteworthy that selecting either &#x201c;<inline-formula>
<mml:math display="inline" id="im33">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>&#x201d; or &#x201c;<inline-formula>
<mml:math display="inline" id="im34">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>&#x201d; consistently yields satisfactory outcomes due to the spatial configuration of tiles and their neighboring elements, reminiscent of a grid-like topology.</p>
<p>NATMIL surpasses all previous MIL models in terms of accuracy and AUC on the Camelyon16 cancer dataset. Notably, within the Camelyon16 dataset, tumor cells might constitute a mere 5% of the WSI. The occurrence of tumor cells in tissue samples is frequently observed at a low frequency, especially in metastatic locations, where tumor cells are distributed among extensive areas of normal cells (<xref ref-type="bibr" rid="B40">40</xref>). Therefore, the NATMIL model, which utilizes a local neighborhood analysis to readjust attention coefficients, demonstrated superior efficacy in detecting medically significant, sparsely distributed malignant spots compared to alternative models. The performance of NATMIL on the Camelyon16 dataset exhibited substantial superiority over the other baselines. The NATMIL model demonstrates a statistically significant improvement of at least 1.5% in terms of AUC compared to other currently available models.</p>
<p>We present the experimental results of the proposed methods on the CAMELYON-16 and TCGA lung cancer datasets in comparison to the following baseline methods: i) classic AB-MIL; ii) RNN-based RNN-MIL; iii) attention-based CLAM-SB, CLAM-MB; and iv) transformer-based MIL, Trans-MIL.</p>
<p>For CAMELYON-16, most slides contain only small portions of tumor over the whole tissue region. The proposed NATMIL methods with different features outperform the other existing MIL methods except Trans-MIL, which uses a transformer-based aggregator but is significantly larger in model size and computational complexity. NATMIL achieves an AUC 0.7% higher than that of DTFT-MIL, as the model uses different feature distillations.</p>
<p>For TCGA lung cancer, the proposed methods also achieve leading performances, with NATMIL obtaining the best AUC value of 94.2%. Due to the significantly larger tumor regions in positive slides, even RNN and DTFT-based MIL methods perform well on the TCGA lung cancer dataset, resulting in less obvious superiority of the proposed methods over other existing methods. In comparison, for the much more challenging CAMELYON-16 dataset, the proposed method is robust to the small portions of tumor regions in positive slides.</p>
<p>In the TCGA-NSCLC dataset, it was observed that NATMIL had superior performance compared to the other baselines that were taken into consideration. The max-pooling approach, which employs the max operator as an aggregation function, demonstrated superior performance compared to other methods. The remarkable efficacy of max pooling on this dataset can be attributed to the observation that tumor cells constitute approximately 80% of the WSI in the TCGA-NSCLC dataset. The probability of accurately labeling distinct malignant cells is significantly elevated.</p>
<sec id="s5_1">
<label>5.1</label>
<title>Ablation study</title>
<p>Our ablation investigation examined the efficacy of the Neighborhood Attention (NA) design block and the surrounding attention module. We tested how changing the neighborhood size <italic>k</italic> affected the efficiency of our NATMIL model. As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, we observed that for low values of <italic>k</italic> (i.e., <italic>k</italic> &#x2208; {2, 3, 4}), the model behaved similarly after being trained under identical experimental conditions. This consistency makes sense, given that nearby tiles convey the most significant information regarding the risk of a tile being malignant. Robustness to the choice of <italic>k</italic> is desirable because hyperparameter adjustment is time-consuming. However, as the value of <italic>k</italic> increased, there was a progressive decline in the model&#x2019;s performance, except for a notable improvement when <italic>k</italic> equaled 8.</p>
<p>The observed phenomenon can be attributed to the emergence of recurring patterns within tumors, occurring at intervals of approximately eight tiles. This underscores the significance of employing models capable of capturing both local adjacent information and overall trends in the biopsy. It was also noted that the selection of either <italic>k</italic> = 4 or <italic>k</italic> = 8 consistently yielded appropriate outcomes due to the spatial configuration of tiles and their neighboring elements, which exhibit characteristics reminiscent of a grid-like topology.</p>
<p>We examined the impact of our NAT design, which includes convolutional downsampling and a deeper-thinner architecture. To evaluate its effectiveness, we conducted an ablation study comparing models utilizing self-attention and shifted window self-attention. The model was gradually transformed into NAT, and the outcomes are displayed in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The initial step</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Performance of different attention mechanisms and convolutions on the TCGA-NSCLC dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Attention</th>
<th valign="top" align="center">Downsampler</th>
<th valign="top" align="center"># of layers</th>
<th valign="top" align="center"># of heads</th>
<th valign="top" align="center">MLP ratio</th>
<th valign="top" align="center">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Self-Attn</td>
<td valign="top" align="center">Patch</td>
<td valign="top" align="center">2, 4, 6, 2</td>
<td valign="top" align="center">3</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">0.9061</td>
</tr>
<tr>
<td valign="top" align="center">Window self-Attn</td>
<td valign="top" align="center">Conv</td>
<td valign="top" align="center">2, 4, 6, 2</td>
<td valign="top" align="center">3</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">0.9131</td>
</tr>
<tr>
<td valign="top" align="center">Neighbor Attn</td>
<td valign="top" align="center">Conv</td>
<td valign="top" align="center">3, 4, 18, 5</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">3</td>
<td valign="top" align="center">0.9210</td>
</tr>
<tr>
<td valign="top" align="center">Convolution</td>
<td valign="top" align="center">Conv</td>
<td valign="top" align="center">3, 4, 18, 5</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">3</td>
<td valign="top" align="center">0.9127</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>involved substituting the patched embedding and patched merge techniques with the overlapping convolution design employed in the Neighborhood Attention Transformer (NAT) model. This led to an increase in accuracy of approximately 0.5%. Upon implementing the second phase of reducing the model size and computational load by increasing its depth and reducing its width, an approximate improvement in accuracy of 0.9% compared to the initial step was observed. As a result, a minor decrease in accuracy was observed. Nevertheless, by substituting Window-Shifted Attention and Self-Window-Shifted Attention with our Neighborhood Attention, a notable enhancement of 0.5% in accuracy was observed.</p>
<p>Additionally, we conducted a kernel size investigation as shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. The experiment involved varying kernel sizes from 3&#xd7;3 to 9&#xd7;9 in order to examine the impact on the performance of our model.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Performance comparison of NATMIL with different kernel sizes on the TCGA-LUSC dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Kernel size</th>
<th valign="top" align="center">ACC</th>
<th valign="top" align="center">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">3&#xd7;3</td>
<td valign="top" align="center">0.8900 &#xb1; 0.0137</td>
<td valign="top" align="center">0.9260 &#xb1; 0.3206</td>
</tr>
<tr>
<td valign="top" align="center">5&#xd7;5</td>
<td valign="top" align="center">0.8810 &#xb1; 0.9938</td>
<td valign="top" align="center">0.9263 &#xb1; 0.2637</td>
</tr>
<tr>
<td valign="top" align="center">7&#xd7;7</td>
<td valign="top" align="center">0.8920 &#xb1; 0.0545</td>
<td valign="top" align="center">0.9304 &#xb1; 0.5445</td>
</tr>
<tr>
<td valign="top" align="center">9&#xd7;9</td>
<td valign="top" align="center">0.8980 &#xb1; 0.0131</td>
<td valign="top" align="center">0.9401 &#xb1; 0.1238</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusion</title>
<p>In this paper, we present the first effective and scalable sliding window attention technique for vision, called Neighborhood Attention. The first aggregation method employs the independence assumption to provide an attention score for each tile in the picture, whereas the second uses vision transformers to produce an attention score that accounts for the correlation between tiles.</p>
<p>To re-adjust the estimated attention ratings based on the similarities they share, we have introduced NATMIL, a unique MIL vision transformer-based method that considers the interdependence of nearby tiles in a histopathological image. By leveraging the pathologists&#x2019; existing slide-level labeling, NATMIL improves performance, reduces their burden, and makes more data available.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <uri xlink:href="https://portal.gdc.cancer.gov/">https://portal.gdc.cancer.gov/</uri>.</p>
</sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>RA: Writing &#x2013; original draft. QY: Funding acquisition, Project administration, Supervision, Writing &#x2013; review &amp; editing. JZ:&#xa0;Conceptualization, Writing &#x2013; review &amp; editing. GY: Methodology, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. YH: Methodology, Writing &#x2013; review &amp; editing. ZU: Writing &#x2013; review &amp; editing. FM: Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s9" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This study was funded by the National Natural Science Foundation of China (NSFC), No. 62376183, No. U21A20469, and No. 61972274, and by the Central Guidance for Local Scientific and Technological Development Funds, No. YDZJSX2022C004.</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>We thank the reviewers and editors for their constructive and valuable advice, which helped to improve this article.</p>
</ack>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Faguet</surname> <given-names>GB</given-names>
</name>
</person-group>. <article-title>A brief history of cancer: age-old milestones underlying our current knowledge database</article-title>. <source>Int J Cancer</source>. (<year>2015</year>) <volume>136</volume>:<page-range>2022&#x2013;36</page-range>. doi: <pub-id pub-id-type="doi">10.1002/ijc.29134</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>UrRehman</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Qiang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Khattak</surname> <given-names>SU</given-names>
</name>
<etal/>
</person-group>. <article-title>Effective lung nodule detection using deep CNN with dual attention mechanisms</article-title>. <source>Sci Rep</source>. (<year>2024</year>) <volume>14</volume>:<elocation-id>3934</elocation-id>. doi: <pub-id pub-id-type="doi">10.1038/s41598-024-51833-x</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morales</surname> <given-names>S</given-names>
</name>
<name>
<surname>Engan</surname> <given-names>K</given-names>
</name>
<name>
<surname>Naranjo</surname> <given-names>V</given-names>
</name>
</person-group>. <article-title>Artificial intelligence in computational pathology &#x2013; challenges and future directions</article-title>. <source>Digital Signal Process</source>. (<year>2021</year>) <volume>119</volume>:<fpage>103196</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.dsp.2021.103196</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Melendez</surname> <given-names>J</given-names>
</name>
<name>
<surname>Van Ginneken</surname> <given-names>B</given-names>
</name>
<name>
<surname>Maduskar</surname> <given-names>P</given-names>
</name>
<name>
<surname>Philipsen</surname> <given-names>RH</given-names>
</name>
<name>
<surname>Reither</surname> <given-names>K</given-names>
</name>
<name>
<surname>Breuninger</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>A novel multiple-instance learning-based approach to computer-aided detection of tuberculosis on chest x-rays</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2014</year>) <volume>34</volume>:<page-range>179&#x2013;92</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2014.2350539</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>G</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Ku</surname> <given-names>C</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>C</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Camel: A weakly supervised learning framework for histopathology image segmentation</article-title>, in: <conf-name>Proceedings of the IEEE/CVF International Conference on computer vision</conf-name>. pp. <page-range>10682&#x2013;91</page-range>.</citation>
</ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>J-Y</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>EI-C</given-names>
</name>
<name>
<surname>Lai</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tu</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>Weakly supervised histopathology cancer image segmentation and classification</article-title>. <source>Med Image Anal</source>. (<year>2014</year>) <volume>18</volume>:<fpage>591</fpage>&#x2013;<lpage>604</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2014.01.010</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>C</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>S</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>R</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>Histopathology classification and localization of colorectal cancer using global labels by weakly supervised deep learning</article-title>. <source>Computerized Med Imaging Graphics</source>. (<year>2021</year>) <volume>88</volume>:<fpage>101861</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compmedimag.2021.101861</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Shrivastava</surname> <given-names>A</given-names>
</name>
<name>
<surname>Ehsan</surname> <given-names>L</given-names>
</name>
<name>
<surname>Moskaluk</surname> <given-names>CA</given-names>
</name>
<name>
<surname>Syed</surname> <given-names>S</given-names>
</name>
<name>
<surname>Brown</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>Cluster-to-conquer: A framework for end-to-end multi-instance learning for whole slide image classification</article-title>. In: <source>Medical imaging with deep learning</source>. <publisher-name>PMLR</publisher-name> (<year>2021</year>). p. <page-range>682&#x2013;98</page-range>.</citation>
</ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aftab</surname> <given-names>R</given-names>
</name>
<name>
<surname>Qiang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Urrehman</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>Graph neural network for representation learning of lung cancer</article-title>. <source>BMC Cancer</source>. (<year>2023</year>) <volume>23</volume>:<fpage>1037</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12885-023-11516-8</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hou</surname> <given-names>L</given-names>
</name>
<name>
<surname>Samaras</surname> <given-names>D</given-names>
</name>
<name>
<surname>Kurc</surname> <given-names>TM</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Davis</surname> <given-names>JE</given-names>
</name>
<name>
<surname>Saltz</surname> <given-names>JH</given-names>
</name>
</person-group>. (<year>2016</year>). <article-title>Patch-based convolutional neural network for whole slide tissue image classification</article-title>, in: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. pp. <page-range>2424&#x2013;33</page-range>.</citation>
</ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Campanella</surname> <given-names>G</given-names>
</name>
<name>
<surname>Hanna</surname> <given-names>MG</given-names>
</name>
<name>
<surname>Geneslaw</surname> <given-names>L</given-names>
</name>
<name>
<surname>Miraflor</surname> <given-names>A</given-names>
</name>
<name>
<surname>Werneck Krauss Silva</surname> <given-names>V</given-names>
</name>
<name>
<surname>Busam</surname> <given-names>KJ</given-names>
</name>
<etal/>
</person-group>. <article-title>Clinical-grade computational pathology using weakly supervised deep learning on whole slide images</article-title>. <source>Nat Med</source>. (<year>2019</year>) <volume>25</volume>:<page-range>1301&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41591-019-0508-1</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Landini</surname> <given-names>G</given-names>
</name>
<name>
<surname>Martinelli</surname> <given-names>G</given-names>
</name>
<name>
<surname>Piccinini</surname> <given-names>F</given-names>
</name>
</person-group>. <article-title>Colour deconvolution: stain unmixing in histological imaging</article-title>. <source>Bioinformatics</source>. (<year>2021</year>) <volume>37</volume>:<page-range>1485&#x2013;7</page-range>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa847</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ilse</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tomczak</surname> <given-names>J</given-names>
</name>
<name>
<surname>Welling</surname> <given-names>M</given-names>
</name>
</person-group>. (<year>2018</year>). <article-title>Attention-based deep multiple instance learning</article-title>, in: <conf-name>International conference on machine learning (PMLR)</conf-name>. pp. <page-range>2127&#x2013;36</page-range>.</citation>
</ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>P</given-names>
</name>
<name>
<surname>Bai</surname> <given-names>X</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W</given-names>
</name>
</person-group>. <article-title>Revisiting multiple instance neural networks</article-title>. <source>Pattern Recognition</source>. (<year>2018</year>) <volume>74</volume>:<fpage>15</fpage>&#x2013;<lpage>24</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patcog.2017.08.026</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname> <given-names>MY</given-names>
</name>
<name>
<surname>Williamson</surname> <given-names>DF</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>TY</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>RJ</given-names>
</name>
<name>
<surname>Barbieri</surname> <given-names>M</given-names>
</name>
<name>
<surname>Mahmood</surname> <given-names>F</given-names>
</name>
</person-group>. <article-title>Data-efficient and weakly supervised computational pathology on whole-slide images</article-title>. <source>Nat Biomed Eng</source>. (<year>2021</year>) <volume>5</volume>:<page-range>555&#x2013;70</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41551-020-00682-w</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname> <given-names>A</given-names>
</name>
<name>
<surname>Shazeer</surname> <given-names>N</given-names>
</name>
<name>
<surname>Parmar</surname> <given-names>N</given-names>
</name>
<name>
<surname>Uszkoreit</surname> <given-names>J</given-names>
</name>
<name>
<surname>Jones</surname> <given-names>L</given-names>
</name>
<name>
<surname>Gomez</surname> <given-names>AN</given-names>
</name>
<etal/>
</person-group>. <article-title>Attention is all you need</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2017</year>) <volume>30</volume>.</citation>
</ref>
<ref id="B17">
<label>17</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>BenTaieb</surname> <given-names>A</given-names>
</name>
<name>
<surname>Hamarneh</surname> <given-names>G</given-names>
</name>
</person-group>. (<year>2018</year>). <article-title>Predicting cancer with a recurrent visual attention model for histopathology images</article-title>, in: <conf-name>Medical Image Computing and Computer Assisted Intervention&#x2013;MICCAI 2018: 21st International Conference, Granada, Spain, September 16-20, 2018, Proceedings, Part II 11 (Springer)</conf-name>. pp. <page-range>129&#x2013;37</page-range>.</citation>
</ref>
<ref id="B18">
<label>18</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>K</given-names>
</name>
<name>
<surname>Van Arnam</surname> <given-names>J</given-names>
</name>
<name>
<surname>Gupta</surname> <given-names>R</given-names>
</name>
<name>
<surname>Saltz</surname> <given-names>J</given-names>
</name>
<name>
<surname>Vakalopoulou</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A joint spatial and magnification based attention framework for large scale histopathology classification</article-title>, in: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. pp. <page-range>3776&#x2013;84</page-range>.</citation>
</ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>X</given-names>
</name>
<name>
<surname>Jonnagaddala</surname> <given-names>J</given-names>
</name>
<name>
<surname>Hawkins</surname> <given-names>N</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Whole slide images based cancer survival prediction using attention guided deep multiple instance learning networks</article-title>. <source>Med Image Anal</source>. (<year>2020</year>) <volume>65</volume>:<fpage>101789</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2020.101789</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>J</given-names>
</name>
<name>
<surname>Li</surname> <given-names>W</given-names>
</name>
<name>
<surname>Sisk</surname> <given-names>A</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>H</given-names>
</name>
<name>
<surname>Wallace</surname> <given-names>WD</given-names>
</name>
<name>
<surname>Speier</surname> <given-names>W</given-names>
</name>
<etal/>
</person-group>. <article-title>A multi-resolution model for histopathology image classification and localization with multiple instance learning</article-title>. <source>Comput Biol Med</source>. (<year>2021</year>) <volume>131</volume>:<fpage>104253</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.104253</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>B</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Eliceiri</surname> <given-names>KW</given-names>
</name>
</person-group>. (<year>2021</year>). <article-title>Dual-stream multiple instance learning network for whole slide image classification with self-supervised contrastive learning</article-title>, in: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. pp. <page-range>14318&#x2013;28</page-range>.</citation>
</ref>
<ref id="B22">
<label>22</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>J</given-names>
</name>
<name>
<surname>He</surname> <given-names>X</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>B</given-names>
</name>
</person-group>. <article-title>Multiple instance learning with graph neural networks</article-title>. <source>arXiv preprint arXiv:1906.04881</source>. (<year>2019</year>).</citation>
</ref>
<ref id="B23">
<label>23</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>F</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>N</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Predicting lymph node metastasis using histopathological images based on multiple instance learning with deep graph convolution</article-title>, in: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. pp. <page-range>4837&#x2013;46</page-range>.</citation>
</ref>
<ref id="B24">
<label>24</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Bian</surname> <given-names>H</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>X</given-names>
</name>
<etal/>
</person-group>. <article-title>Transmil: Transformer based correlated multiple instance learning for whole slide image classification</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2021</year>) <volume>34</volume>:<page-range>2136&#x2013;47</page-range>.</citation>
</ref>
<ref id="B25">
<label>25</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramachandran</surname> <given-names>P</given-names>
</name>
<name>
<surname>Parmar</surname> <given-names>N</given-names>
</name>
<name>
<surname>Vaswani</surname> <given-names>A</given-names>
</name>
<name>
<surname>Bello</surname> <given-names>I</given-names>
</name>
<name>
<surname>Levskaya</surname> <given-names>A</given-names>
</name>
<name>
<surname>Shlens</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Stand-alone self-attention in vision models</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2019</year>) <volume>32</volume>.</citation>
</ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>M</given-names>
</name>
<name>
<surname>Aftab</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>Study on the prediction method of long-term benign and malignant pulmonary lesions based on LSTM</article-title>. <source>Front Bioengineering Biotechnol</source>. (<year>2022</year>) <volume>10</volume>:<elocation-id>791424</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fbioe.2022.791424</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beltagy</surname> <given-names>I</given-names>
</name>
<name>
<surname>Peters</surname> <given-names>ME</given-names>
</name>
<name>
<surname>Cohan</surname> <given-names>A</given-names>
</name>
</person-group>. <article-title>Longformer: The long-document transformer</article-title>. <source>arXiv preprint arXiv:2004.05150</source>. (<year>2020</year>).</citation>
</ref>
<ref id="B28">
<label>28</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>P</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>X</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>B</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Multi-scale vision longformer: A new vision transformer for high-resolution image encoding</article-title>, in: <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. pp. <fpage>2998</fpage>&#x2013;<lpage>3008</lpage>.</citation>
</ref>
<ref id="B29">
<label>29</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>, in: <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. pp. <page-range>10012&#x2013;22</page-range>.</citation>
</ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>W</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Low-dose computed tomography image reconstruction via a multistage convolutional neural network with autoencoder perceptual loss network</article-title>. <source>Quantitative Imaging Med Surg</source>. (<year>2022</year>) <volume>12</volume>:<elocation-id>1929</elocation-id>. doi: <pub-id pub-id-type="doi">10.21037/qims-21-465</pub-id>
</citation>
</ref>
<ref id="B31">
<label>31</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname> <given-names>A</given-names>
</name>
<name>
<surname>Ramachandran</surname> <given-names>P</given-names>
</name>
<name>
<surname>Srinivas</surname> <given-names>A</given-names>
</name>
<name>
<surname>Parmar</surname> <given-names>N</given-names>
</name>
<name>
<surname>Hechtman</surname> <given-names>B</given-names>
</name>
<name>
<surname>Shlens</surname> <given-names>J</given-names>
</name>
</person-group>. (<year>2021</year>). <article-title>Scaling local self-attention for parameter efficient visual backbones</article-title>, in: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. pp. <page-range>12894&#x2013;904</page-range>.</citation>
</ref>
<ref id="B32">
<label>32</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>H</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>C-Y</given-names>
</name>
<name>
<surname>Feichtenhofer</surname> <given-names>C</given-names>
</name>
<name>
<surname>Darrell</surname> <given-names>T</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>S</given-names>
</name>
</person-group>. (<year>2022</year>). <article-title>A convnet for the 2020s</article-title>, in: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <page-range>11976&#x2013;86</page-range>.</citation>
</ref>
<ref id="B33">
<label>33</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>T</given-names>
</name>
<name>
<surname>Kornblith</surname> <given-names>S</given-names>
</name>
<name>
<surname>Norouzi</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hinton</surname> <given-names>G</given-names>
</name>
</person-group>. (<year>2020</year>). <article-title>A simple framework for contrastive learning of visual representations</article-title>, in: <conf-name>International conference on machine learning (PMLR)</conf-name>, pp. <page-range>1597&#x2013;607</page-range>.</citation>
</ref>
<ref id="B34">
<label>34</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>W</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>E</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>D-P</given-names>
</name>
<name>
<surname>Song</surname> <given-names>K</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>D</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Pyramid vision transformer: A versatile backbone for dense prediction without convolutions</article-title>, in: <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, pp. <page-range>568&#x2013;78</page-range>.</citation>
</ref>
<ref id="B35">
<label>35</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bejnordi</surname> <given-names>BE</given-names>
</name>
<name>
<surname>Veta</surname> <given-names>M</given-names>
</name>
<name>
<surname>Van Diest</surname> <given-names>PJ</given-names>
</name>
<name>
<surname>Van Ginneken</surname> <given-names>B</given-names>
</name>
<name>
<surname>Karssemeijer</surname> <given-names>N</given-names>
</name>
<name>
<surname>Litjens</surname> <given-names>G</given-names>
</name>
<etal/>
</person-group>. <article-title>Diagnostic assessment of deep learning algorithms for detection of lymph node metastases in women with breast cancer</article-title>. <source>JAMA</source>. (<year>2017</year>) <volume>318</volume>:<page-range>2199&#x2013;210</page-range>.</citation>
</ref>
<ref id="B36">
<label>36</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aftab</surname> <given-names>R</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Contrastive learning for whole slide image representation: A self-supervised approach in digital pathology</article-title>. <source>Eur J Appl Science Eng Technol</source>. (<year>2024</year>) <volume>2</volume>:<page-range>175&#x2013;85</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.59324/ejaset.2024.2(2)</pub-id>
</citation>
</ref>
<ref id="B37">
<label>37</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Andersson</surname> <given-names>A</given-names>
</name>
<name>
<surname>Koriakina</surname> <given-names>N</given-names>
</name>
<name>
<surname>Sladoje</surname> <given-names>N</given-names>
</name>
<name>
<surname>Lindblad</surname> <given-names>J</given-names>
</name>
</person-group>. (<year>2022</year>). <article-title>End-to-end multiple instance learning with gradient accumulation</article-title>, in: <conf-name>2022 IEEE International Conference on Big Data (Big Data)</conf-name>, pp. <page-range>2742&#x2013;6</page-range>. <publisher-name>IEEE</publisher-name>.</citation>
</ref>
<ref id="B38">
<label>38</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>H</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Qiao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Coupland</surname> <given-names>SE</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Dtfd-mil: Double-tier feature distillation multiple instance learning for histopathology whole slide image classification</article-title>, in: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, pp. <page-range>18802&#x2013;12</page-range>.</citation>
</ref>
<ref id="B39">
<label>39</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tourniaire</surname> <given-names>P</given-names>
</name>
<name>
<surname>Ilie</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hofman</surname> <given-names>P</given-names>
</name>
<name>
<surname>Ayache</surname> <given-names>N</given-names>
</name>
<name>
<surname>Delingette</surname> <given-names>H</given-names>
</name>
</person-group>. <article-title>Ms-clam: Mixed supervision for the classification and localization of tumors in whole slide images</article-title>. <source>Med Image Anal</source>. (<year>2023</year>) <volume>85</volume>:<fpage>102763</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2023.102763</pub-id>
</citation>
</ref>
<ref id="B40">
<label>40</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>J</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>W</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>W</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Zhan</surname> <given-names>X</given-names>
</name>
<etal/>
</person-group>. <article-title>Computational image analysis identifies histopathological image features associated with somatic mutations and patient survival in gastric adenocarcinoma</article-title>. <source>Front Oncol</source>. (<year>2021</year>) <volume>11</volume>:<elocation-id>623382</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fonc.2021.623382</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>