<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2024.1360143</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Medicine</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Contrastive learning with token projection for Omicron pneumonia identification from few-shot chest CT images</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Jiang</surname> <given-names>Xiaoben</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2248808/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Yang</surname> <given-names>Dawei</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1876839/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Feng</surname> <given-names>Li</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhu</surname> <given-names>Yu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Mingliang</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1656947/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Feng</surname> <given-names>Yinzhou</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2421656/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Bai</surname> <given-names>Chunxue</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1865449/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Fang</surname> <given-names>Hao</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2178497/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>School of Information Science and Technology, East China University of Science and Technology</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Pulmonary and Critical Care Medicine, Zhongshan Hospital, Fudan University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Shanghai Engineering Research Center of Internet of Things for Respiratory Medicine</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Pulmonary and Critical Care Medicine, Zhongshan Hospital (Xiamen), Fudan University</institution>, <addr-line>Xiamen, Fujian</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Nursing, Zhongshan Hospital, Fudan University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>Department of Anesthesiology, Zhongshan Hospital, Fudan University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff7"><sup>7</sup><institution>Department of Anesthesiology, Shanghai Geriatric Medical Center</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0002">
<p>Edited by: Michalis Savelonas, University of Thessaly, Greece</p>
</fn>
<fn fn-type="edited-by" id="fn0003">
<p>Reviewed by: Carl-Magnus Svensson, Leibniz Institute for Natural Product Research and Infection Biology, Germany</p>
<p>Nabil Ibtehaz, Purdue University, United States</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Yu Zhu, <email>zhuyu@ecust.edu.cn</email>; Hao Fang, <email>drfanghao@163.com</email></corresp>
<fn fn-type="equal" id="fn0001">
<p><sup>&#x2020;</sup>These authors have contributed equally to this work and share first authorship</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>02</day>
<month>05</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1360143</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>12</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>04</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Jiang, Yang, Feng, Zhu, Wang, Feng, Bai and Fang.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Jiang, Yang, Feng, Zhu, Wang, Feng, Bai and Fang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1">
<title>Introduction</title>
<p>Deep learning-based methods can accelerate the diagnosis of pneumonia from chest computed tomography (CT) images and save critical time, but these methods usually rely on large amounts of labeled data to learn good visual representations. However, medical images are difficult to acquire and must be annotated by professional radiologists.</p>
</sec>
<sec id="sec2">
<title>Methods</title>
<p>To address this issue, a novel contrastive learning model with token projection, namely CoTP, is proposed to improve the diagnostic quality of few-shot chest CT images. Specifically, (1) we use only unlabeled data to pre-train CoTP, together with a small number of labeled samples for fine-tuning, (2) we present a new Omicron dataset and adapt the data augmentation strategy, i.e., random Poisson noise perturbation, for the CT interpretation task, and (3) token projection is utilized to further improve the quality of the global visual representations.</p>
</sec>
<sec id="sec3">
<title>Results</title>
<p>The ResNet50 pre-trained by CoTP attained an accuracy (ACC) of 92.35%, sensitivity (SEN) of 92.96%, precision (PRE) of 91.54%, and area under the receiver-operating characteristic curve (AUC) of 98.90% on the presented Omicron dataset. In contrast, the ResNet50 without pre-training achieved ACC, SEN, PRE, and AUC of 77.61%, 77.90%, 76.69%, and 85.66%, respectively.</p>
</sec>
<sec id="sec4">
<title>Conclusion</title>
<p>Extensive experiments reveal that a model pre-trained by CoTP greatly outperforms one without pre-training. CoTP can improve diagnostic efficacy and reduce the heavy workload of radiologists in screening for Omicron pneumonia.</p>
</sec>
</abstract>
<kwd-group>
<kwd>contrastive learning</kwd>
<kwd>token projection</kwd>
<kwd>omicron pneumonia identification</kwd>
<kwd>random Poisson noise perturbation</kwd>
<kwd>chest CT images</kwd>
</kwd-group>
<counts>
<fig-count count="14"/>
<table-count count="11"/>
<equation-count count="9"/>
<ref-count count="52"/>
<page-count count="15"/>
<word-count count="8684"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Pulmonary Medicine</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec5">
<label>1</label>
<title>Introduction</title>
<p>At the end of February 2022, a new wave of the COVID-19 epidemic, caused by the Omicron subvariants BA.2 and BA.2.2, broke out in Shanghai (<xref ref-type="bibr" rid="ref1">1</xref>). The spike protein of the Omicron variant carries more than 30 mutation sites, which increase the binding ability of the virus to human cells, and its infectivity is 37.5% higher than that of the Delta variant (<xref ref-type="bibr" rid="ref2">2</xref>, <xref ref-type="bibr" rid="ref3">3</xref>). By 1 June 2022, Omicron had caused 626,811 infections, including 568,811 asymptomatic infections, 58,000 symptomatic cases, and 588 deaths (<xref ref-type="bibr" rid="ref4">4</xref>), posing a great crisis and challenge to public health security (<xref ref-type="bibr" rid="ref5">5</xref>).</p>
<p>Currently, the real-time reverse-transcriptase&#x2013;polymerase-chain-reaction (RT-PCR) test is the main diagnostic tool (<xref ref-type="bibr" rid="ref6">6</xref>), while chest CT imaging is increasingly recognized as a complementary or even a reliable alternative method (<xref ref-type="bibr" rid="ref7">7</xref>, <xref ref-type="bibr" rid="ref8">8</xref>). <xref ref-type="fig" rid="fig1">Figure 1</xref> illustrates CT scan images of mild and severe Omicron pneumonia. All annotations were provided by experienced doctors, who evaluated patients based on their clinical conditions and CT imaging. The CT images of mild Omicron pneumonia usually show slight inflammation in the lungs, whereas those of severe Omicron pneumonia show more extensive inflammation and damage to the lungs. Physicians need to pay closer attention to patients with severe Omicron pneumonia and treat them in time. However, experienced radiologists are needed to manually review all the thin-slice CT images (an average of 300 slices per patient) (<xref ref-type="bibr" rid="ref9">9</xref>), and this significantly increased workload may lead to misdiagnosis.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>CT scan images of mild and severe Omicron pneumonia. Severe Omicron pneumonia areas are marked with red dotted circles. <bold>(A)</bold> Mild Omicron pneumonia. <bold>(B)</bold> Severe Omicron pneumonia.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g001.tif"/>
</fig>
<p>With the development of deep learning (<xref ref-type="bibr" rid="ref10">10</xref>), researchers can extract useful information from large volumes of annotated data (<xref ref-type="bibr" rid="ref11">11</xref>). However, compared with natural images, acquiring such quantities of medical data is challenging, and the annotations must be produced by professional radiologists (<xref ref-type="bibr" rid="ref12">12</xref>, <xref ref-type="bibr" rid="ref13">13</xref>). This poses major challenges for applying deep learning to medical image analysis and processing (<xref ref-type="bibr" rid="ref14">14</xref>). In recent years, contrastive learning methods (<xref ref-type="bibr" rid="ref15 ref16 ref17 ref18 ref19">15&#x2013;19</xref>) have achieved satisfactory results in natural image classification tasks. These methods can utilize unlabeled data to create a pre-trained model, which can then be fine-tuned with a small amount of annotated data for further improvement.</p>
<p>While some studies have investigated the effects of contrastive learning on natural image classification tasks, there remains a gap in research that specifically addresses chest CT images. Current contrastive learning-based methods are insufficient for effectively augmenting chest CT images and for exploring global features. To address this issue, we propose a novel contrastive learning model with token projection, namely CoTP, to improve global visual representation. The token projection consists of a multi-head self-attention (MHSA) (<xref ref-type="bibr" rid="ref20">20</xref>) and a fully connected (FC) layer. The MHSA can capture short- and long-range visual dependencies, while the FC layer can eliminate redundant features. Moreover, we leverage a downsampling layer to reduce the computational cost. In addition, a private Omicron dataset collected by the Geriatric Medical Center, Zhongshan Hospital, Fudan University is utilized for CoTP pre-training. In particular, data augmentation plays an important role in contrastive learning methods (<xref ref-type="bibr" rid="ref15">15</xref>). However, the augmentations widely used in contrastive learning approaches for natural images may not be suitable for chest CT. Therefore, a new data augmentation approach, random Poisson noise perturbation (PNP), is proposed to simulate the noise in CT images. After pre-training, the feature encoder with pre-trained weights is taken out, followed by a simple max pooling and average pooling (MAP) head, which can capture the different spatial regions occupied by objects of different categories. Then, we fine-tune the model on a sub-dataset extracted from the Omicron dataset and on the external SARS-CoV-2 CT-scan dataset (<xref ref-type="bibr" rid="ref21">21</xref>), respectively. Extensive experiments reveal that a model pre-trained by the proposed CoTP greatly outperforms that without pre-training.</p>
<p>Our main contributions to this work are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>A novel contrastive learning model with token projection, namely CoTP, is proposed to improve the diagnostic quality of few-shot Omicron chest CT images. In particular, token projection with a downsampling layer is utilized to further improve the quality of the global visual representations while reducing the computational cost. In addition, the MAP head is employed to capture the different spatial regions occupied by objects of different categories.</p>
</list-item>
<list-item>
<p>We present a new Omicron dataset approved by the institutional review board of Zhongshan Hospital, Fudan University in Shanghai. Furthermore, we leverage a new data augmentation approach, random Poisson noise perturbation (PNP), to more realistically simulate the noise in CT images.</p>
</list-item>
<list-item>
<p>We verify the effectiveness of the proposed CoTP on the private Omicron dataset and on the external SARS-CoV-2 CT-scan dataset, and it delivers promising results on both datasets.</p>
</list-item>
</list>
</sec>
<sec id="sec6">
<label>2</label>
<title>Related work</title>
<sec id="sec7">
<label>2.1</label>
<title>Supervised learning for diagnosis of pneumonia from chest CT images</title>
<p>Since the outbreak of coronavirus disease (COVID-19) was declared a pandemic by the WHO on 11 March 2020, various deep learning-based methods have been implemented worldwide to accelerate and save critical time in pneumonia diagnosis from CT images. Wu et al. (<xref ref-type="bibr" rid="ref9">9</xref>) proposed a multi-view fusion model to improve the efficacy of diagnosis. Panwar et al. (<xref ref-type="bibr" rid="ref27">27</xref>) designed a Grad-CAM-based deep learning method for fast detection of COVID-19 cases. Another study (<xref ref-type="bibr" rid="ref28">28</xref>) diagnosed COVID-19 on CT images via a network with a multi-receptive field attention module. Moreover, Mei et al. (<xref ref-type="bibr" rid="ref22">22</xref>) adopted ResNet (<xref ref-type="bibr" rid="ref23">23</xref>) to rapidly diagnose COVID-19 patients using both full CT scans and non-image information. In addition, several works (<xref ref-type="bibr" rid="ref24 ref25 ref26">24&#x2013;26</xref>) also used segmentation techniques for detection. However, the current deep learning-based approaches for pneumonia diagnosis rely primarily on supervised learning, leveraging abundant labeled data to acquire precise visual representations, whereas labeled chest CT images are available only in few-shot quantities. <xref ref-type="table" rid="tab1">Table 1</xref> summarizes the previous studies on supervised learning for pneumonia diagnosis from chest CT images.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Previous studies on supervised learning for pneumonia diagnosis from chest CT images.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Authors</th>
<th align="center" valign="top">Year published</th>
<th align="left" valign="top">Pros.</th>
<th align="left" valign="top">Cons.</th>
<th align="left" valign="top">Results</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Wu et al. (<xref ref-type="bibr" rid="ref9">9</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Axial, coronal, and sagittal views of each chest CT image are selected as the inputs of the deep learning network.</td>
<td align="left" valign="middle">Subgroup analysis was limited by the unavailability of detailed clinical information.</td>
<td align="left" valign="middle">81.9% AUC on CT images dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Panwar et al. (<xref ref-type="bibr" rid="ref27">27</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Grad-CAM-based color visualization approach and early stopping.</td>
<td align="left" valign="middle">Lack of ground truth boxes to detect lesions.</td>
<td align="left" valign="middle">95% ACC on the SARS-COV-2 CT-scan dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Mei et al. (<xref ref-type="bibr" rid="ref22">22</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Demographic and clinical data are also integrated by an MLP network to rapidly diagnose patients.</td>
<td align="left" valign="middle">The study has a small sample size.</td>
<td align="left" valign="middle">92% AUC on the COVID-19 dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Chen et al. (<xref ref-type="bibr" rid="ref24">24</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Performing both classification and detection tasks simultaneously.</td>
<td align="left" valign="middle">The inference time is slow</td>
<td align="left" valign="middle">98.85% ACC in the internal retrospective dataset</td>
</tr>
<tr>
<td align="left" valign="middle">Wang et al. (<xref ref-type="bibr" rid="ref26">26</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">A novel noise-robust Dice loss function, adaptive teacher and student mechanisms.</td>
<td align="left" valign="middle">Incorrect predictions tend to be related to noisy labels.</td>
<td align="left" valign="middle">80.29% Dice on the COVID-19 pneumonia dataset</td>
</tr>
<tr>
<td align="left" valign="middle">Ma et al. (<xref ref-type="bibr" rid="ref28">28</xref>)</td>
<td align="center" valign="middle">2021</td>
<td align="left" valign="middle">Multi-receptive field attention module.</td>
<td align="left" valign="middle">Lack of ground truth boxes to detect lesions.</td>
<td align="left" valign="middle">99.01% AUC on the SARS-COV-2 CT-scan dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Qiu et al. (<xref ref-type="bibr" rid="ref25">25</xref>)</td>
<td align="center" valign="middle">2021</td>
<td align="left" valign="middle">Attentive Hierarchical Spatial Pyramid module and lightweight multi-scale learning.</td>
<td align="left" valign="middle">Require a large amount of labeled data.</td>
<td align="left" valign="middle">75.91% Dice on the COVID-19-CT dataset.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec8">
<label>2.2</label>
<title>Contrastive learning in image analysis</title>
<p>Given the efficient visual representation ability of deep learning, contrastive learning has emerged as a promising approach for efficiently extracting accurate visual representations from unlabeled images (<xref ref-type="bibr" rid="ref29">29</xref>). Wu et al. (<xref ref-type="bibr" rid="ref16">16</xref>) first designed a framework that pushes apart augmented views of different images (negative pairs) while pulling together different augmented views of the same image (positive pairs). Based on this idea, SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>) and MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>) were proposed, which greatly narrowed the gap between supervised and unsupervised learning on downstream tasks. Their successors, SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>) and MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>), employed a projection head to improve the extraction of visual representations and outperformed supervised learning on downstream tasks.</p>
<p>The success of these methods motivated many researchers to introduce contrastive learning into medical image analysis. Sowrirajan et al. (<xref ref-type="bibr" rid="ref13">13</xref>) utilized MoCo pre-training to improve the representation and transferability of chest X-ray models. Zhang et al. (<xref ref-type="bibr" rid="ref30">30</xref>) obtained medical visual representations through contrastive learning with paired images and texts. In addition, several studies (<xref ref-type="bibr" rid="ref30 ref31 ref32">30&#x2013;32</xref>) employed contrastive learning for medical image segmentation. However, the existing contrastive mechanisms leave room for improvement in Omicron pneumonia diagnosis from chest CT images, owing to their inability to mine global features and the lack of appropriate augmentations for chest CT images. We present the pros and cons of previous studies on contrastive learning in image analysis in <xref ref-type="table" rid="tab2">Table 2</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Previous studies on contrastive learning in image analysis.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Authors</th>
<th align="center" valign="top">Year published</th>
<th align="left" valign="top">Pros.</th>
<th align="left" valign="top">Cons.</th>
<th align="left" valign="top">Results</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Wu et al. (<xref ref-type="bibr" rid="ref16">16</xref>)</td>
<td align="center" valign="middle">2018</td>
<td align="left" valign="middle">Maximize the distinction between instances and non-parametric instance discrimination.</td>
<td align="left" valign="middle">Compared to supervised models, it still has a significant gap.</td>
<td align="left" valign="middle">54.0% ACC on ImageNet dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Chen et al. (<xref ref-type="bibr" rid="ref15">15</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">A learnable nonlinear transformation.</td>
<td align="left" valign="middle">Require a huge batch size of 4,096.</td>
<td align="left" valign="middle">61.9% ACC on ImageNet dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">He et al. (<xref ref-type="bibr" rid="ref18">18</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">A dynamic dictionary with a queue and a moving-averaged encoder.</td>
<td align="left" valign="middle">Requires a large number of negative samples as a queue.</td>
<td align="left" valign="middle">60.6% ACC on ImageNet dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Chen et al. (<xref ref-type="bibr" rid="ref17">17</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Unlabeled examples for refining and transferring the task-specific knowledge.</td>
<td align="left" valign="middle">Require a huge batch size of 4,096.</td>
<td align="left" valign="middle">66.6% ACC on ImageNet dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Chen et al. (<xref ref-type="bibr" rid="ref19">19</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">A learnable nonlinear transformation is added in MoCo-v1.</td>
<td align="left" valign="middle">Requires a large number of negative samples as a queue.</td>
<td align="left" valign="middle">67.5% ACC on ImageNet dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Zhang et al. (<xref ref-type="bibr" rid="ref29">29</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Exploite naturally occurring paired descriptive text.</td>
<td align="left" valign="middle">Require a large amount of text annotations.</td>
<td align="left" valign="middle">91.2% ACC on the NCT-CRC-HE-100&#x2009;K dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Chaitanya et al. (<xref ref-type="bibr" rid="ref31">31</xref>)</td>
<td align="center" valign="middle">2020</td>
<td align="left" valign="middle">Domain-specific contrasting strategies and local version of contrastive loss.</td>
<td align="left" valign="middle">The computational complexity is heavy.</td>
<td align="left" valign="middle">88.6% Dice on the ACDC dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Sowrirajan et al. (<xref ref-type="bibr" rid="ref13">13</xref>)</td>
<td align="center" valign="middle">2021</td>
<td align="left" valign="middle">MoCo-CXR Pre-training for chest X-ray Interpretation.</td>
<td align="left" valign="middle">Lack of effective data augmentation.</td>
<td align="left" valign="middle">81.3% AUC on the CheXpert dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Zeng et al. (<xref ref-type="bibr" rid="ref32">32</xref>)</td>
<td align="center" valign="middle">2021</td>
<td align="left" valign="middle">Generate contrastive data pairs based on the position of a slice in volumetric medical images.</td>
<td align="left" valign="middle">lack of appropriate augmentations for medical images.</td>
<td align="left" valign="middle">92.9% Dice on the ACDC dataset.</td>
</tr>
<tr>
<td align="left" valign="middle">Wu et al. (<xref ref-type="bibr" rid="ref33">33</xref>)</td>
<td align="center" valign="middle">2022</td>
<td align="left" valign="middle">The proposed network does not rely on large negative samples.</td>
<td align="left" valign="middle">Lack of global visual representation.</td>
<td align="left" valign="middle">89.4% Dice on the ACDC dataset.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="materials|methods" id="sec9">
<label>3</label>
<title>Materials and methods</title>
<sec id="sec10">
<label>3.1</label>
<title>The pipeline for interpretation of CT images</title>
<p>The overall pipeline for CoTP pre-training and the subsequent fine-tuning with CT images is illustrated in <xref ref-type="fig" rid="fig2">Figure 2</xref>. CT image interpretation proceeds in two stages: the CoTP pre-training stage and the subsequent fine-tuning stage. First, we converted the DICOM files of Omicron patients and exported them in JPEG format, and then employed CoTP to pre-train the feature encoder using unlabeled Omicron CT images. Second, the feature encoder with pre-trained weights was taken out and followed by a simple linear classifier. Then, we fine-tuned this baseline with a few labeled CT images.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Overview of the pipeline for Omicron pneumonia CT image interpretation.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g002.tif"/>
</fig>
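<p>The DICOM-to-JPEG conversion step mentioned above can be sketched in a few lines of Python. The snippet below is only an illustration of this step, assuming that pydicom and Pillow are available and that a simple min-max rescaling to 8-bit suffices; the file names are placeholders, and the exact windowing applied in practice may differ.</p>
<preformat>
# Minimal sketch of the DICOM-to-JPEG conversion step (illustrative only).
# Assumes pydicom and Pillow; windowing is simplified to min-max rescaling.
import numpy as np
import pydicom
from PIL import Image

def dicom_to_jpeg(dicom_path, jpeg_path):
    ds = pydicom.dcmread(dicom_path)            # read the DICOM file
    img = ds.pixel_array.astype(np.float32)     # raw pixel matrix
    img = (img - img.min()) / (img.max() - img.min() + 1e-8)  # rescale to [0, 1]
    Image.fromarray((img * 255).astype(np.uint8)).save(jpeg_path, "JPEG")

dicom_to_jpeg("slice_0001.dcm", "slice_0001.jpg")
</preformat>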
</sec>
<sec id="sec11">
<label>3.2</label>
<title>Random Poisson noise perturbation</title>
<p>Data augmentation is widely used in contrastive learning and is crucial for learning good representations (<xref ref-type="bibr" rid="ref15">15</xref>). Nevertheless, most existing natural image data augmentations may not be suitable for chest CT images. For example, random crops and cutouts may remove or mask the lesion area of CT images. Meanwhile, color jitter and random grayscale transformation are no longer applicable to grayscale CT images.</p>
<p>As shown in <xref ref-type="fig" rid="fig3">Figure 3</xref>, we utilized not only traditional methods, i.e., random horizontal flipping, random center crop, and random rotation (10 degrees), but also a new data augmentation approach, random Poisson noise perturbation, for CT images. Poisson-distributed noise is a well-known data augmentation (<xref ref-type="bibr" rid="ref34">34</xref>, <xref ref-type="bibr" rid="ref35">35</xref>). However, to the best of our knowledge, this is the first time that Poisson-distributed noise has been applied in contrastive learning in place of Gaussian noise perturbation. During CT scanning, various kinds of noise are generated by photoelectric interactions, and this noise is more accurately characterized by the Poisson distribution (<xref ref-type="bibr" rid="ref36">36</xref>). Consequently, we employed random Poisson noise perturbation as a new data augmentation to simulate the noise in CT images. First, we applied the fan-beam projection (<xref ref-type="bibr" rid="ref37">37</xref>) transform to the CT image <inline-formula>
<mml:math id="M1">
<mml:mi>X</mml:mi>
</mml:math>
</inline-formula>, and added Poisson noise as <xref ref-type="disp-formula" rid="EQ9">Eq. (1)</xref>, where <inline-formula>
<mml:math id="M2">
<mml:mi>b</mml:mi>
</mml:math>
</inline-formula> stands for the number of photons. Here <inline-formula>
<mml:math id="M3">
<mml:mi>b</mml:mi>
</mml:math>
</inline-formula> is set to 1e-6.</p>
<disp-formula id="EQ9">
<label>(1)</label>
<mml:math id="M4">
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">Poisson</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x22C5;</mml:mo>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="italic">Fanbeam</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>X</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
</disp-formula>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Illustrations of a series of CT augmentation methods, i.e., random horizontal flipping, random center crop, random rotation (10 degrees), and the proposed random Poisson noise perturbation.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g003.tif"/>
</fig>
<p>Then, <inline-formula>
<mml:math id="M5">
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is processed with a logarithm and transformed (iFanbeam) by the classical filtered back-projection (FBP) algorithm (<xref ref-type="bibr" rid="ref38">38</xref>) back to the image domain <inline-formula>
<mml:math id="M6">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:math>
</inline-formula>, as in <xref ref-type="disp-formula" rid="E1">Eq. (2)</xref>. Thus, we obtained the Poisson-noise CT image according to Eqs. (1) and (2) as a more realistic data augmentation.</p>
<disp-formula id="E1"><label>(2)</label> <mml:math id="M7">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">iFanbeam</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>ln</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo stretchy="true">/</mml:mo>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math></disp-formula>
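<p>A minimal sketch of the random Poisson noise perturbation in Eqs. (1) and (2) is given below. It uses the parallel-beam radon/iradon transforms from scikit-image as a stand-in for the fan-beam pair (Fanbeam/iFanbeam), so the geometry is only approximated, and the photon count <italic>b</italic> and the angle sampling are passed in as parameters.</p>
<preformat>
# Sketch of random Poisson noise perturbation (Eqs. 1-2); scikit-image's
# parallel-beam radon/iradon stands in for the fan-beam pair described above.
import numpy as np
from skimage.transform import radon, iradon

def poisson_noise_perturbation(ct_image, b, n_angles=180):
    # ct_image: 2-D array of non-negative attenuation values; b: photon count.
    theta = np.linspace(0.0, 180.0, n_angles, endpoint=False)
    sinogram = radon(ct_image, theta=theta)              # forward projection
    photons = np.random.poisson(b * np.exp(-sinogram))   # Eq. (1): Poisson photon counts
    photons = np.clip(photons, 1, None)                  # avoid log(0)
    noisy_sinogram = np.log(b / photons)                 # Eq. (2): log transform
    return iradon(noisy_sinogram, theta=theta)           # back-projection to image domain
</preformat>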
</sec>
<sec id="sec12">
<label>3.3</label>
<title>Overview of the proposed CoTP</title>
<sec id="sec13">
<label>Algorithm 1</label>
<title>Pseudocode of CoTP.</title>
<table-wrap position="anchor" id="tab3">
<table frame="hsides" rules="groups">
<tbody>
<tr>
<td align="left" valign="top"><bold>Input:</bold> batch size <inline-formula>
<mml:math id="M9">
<mml:mi>B</mml:mi>
</mml:math>
</inline-formula>, constant temperature <inline-formula>
<mml:math id="M10">
<mml:mi>&#x03C4;</mml:mi>
</mml:math>
</inline-formula>, negative memory bank <inline-formula>
<mml:math id="M11">
<mml:mi>N</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x00B7;</mml:mo>
<mml:mo>&#x22EF;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>, <break/>encoder networks for query and key <inline-formula>
<mml:math id="M12">
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M13">
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>, Token projection for query and key<inline-formula>
<mml:math id="M14">
<mml:mi>T</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M15">
<mml:mi>T</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>,<break/><bold>for</bold> sampled minibatch <inline-formula>
<mml:math id="M16">
<mml:msubsup>
<mml:mfenced open="{" close="}">
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>B</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula>do<break/><bold>for</bold> all <inline-formula>
<mml:math id="M17">
<mml:mi>q</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}" separators=",,">
<mml:mn>1</mml:mn>
<mml:mo>&#x2026;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula><bold>do</bold><break/>draw two augmentation functions CTAug1, CTAug2<break/># augmentation for query<break/><inline-formula>
<mml:math id="M18">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>q</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula> # encoder<break/><inline-formula>
<mml:math id="M19">
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</inline-formula> # Token projection<break/># augmentation for key<break/><inline-formula>
<mml:math id="M20">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>q</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula> # encoder<break/><inline-formula>
<mml:math id="M21">
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</inline-formula> # Token projection<break/><bold>end for</bold><break/><bold>define</bold> <inline-formula>
<mml:math id="M22">
<mml:mi>L</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mo>log</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mfrac>
<mml:mrow>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mfenced>
</mml:math>
</inline-formula><break/>update networks <inline-formula>
<mml:math id="M23">
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M24">
<mml:mi>T</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula> to minimize <inline-formula>
<mml:math id="M25">
<mml:mi>L</mml:mi>
</mml:math>
</inline-formula><break/><bold>define</bold> momentum update: <inline-formula>
<mml:math id="M26">
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula><break/>update networks <inline-formula>
<mml:math id="M27">
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M28">
<mml:mi>T</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> by momentum update<break/><bold>end for</bold><break/><bold>update</bold> negative memory bank<break/><bold>return</bold> encoder network <inline-formula>
<mml:math id="M29">
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula>, and throw away <inline-formula>
<mml:math id="M30">
<mml:mi>E</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Algorithm 1 summarizes the proposed CoTP.</p>
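<p>For readers who prefer runnable code, the skeleton below restates Algorithm 1 in PyTorch-style Python. It is a sketch rather than the exact training script: <monospace>ct_aug</monospace> denotes the CT augmentations of Section 3.2, <monospace>memory_bank</monospace> is a placeholder queue object holding the negative keys (exposing a <monospace>keys</monospace> tensor and an <monospace>enqueue</monospace> method), and <monospace>cotp_step</monospace> (a single contrastive step) is sketched after Eq. (5) in Section 3.3.3.</p>
<preformat>
# High-level sketch of the CoTP pre-training loop in Algorithm 1 (illustrative only;
# the data loader, models, memory bank, and hyper-parameters are placeholders).
def pretrain_cotp(loader, encoder_q, proj_q, encoder_k, proj_k,
                  optimizer, memory_bank, ct_aug, epochs=200):
    for epoch in range(epochs):
        for x in loader:                          # unlabeled CT images
            v_q, v_k = ct_aug(x), ct_aug(x)       # two CT augmentations (Section 3.2)
            optimizer.zero_grad()
            loss, new_keys = cotp_step(v_q, v_k, encoder_q, proj_q,
                                       encoder_k, proj_k, memory_bank.keys)
            optimizer.step()                      # update E:q and T:q by backprop
            memory_bank.enqueue(new_keys)         # refresh the negative memory bank
    return encoder_q                              # E:k is discarded after pre-training
</preformat>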
</sec>
<sec id="sec14">
<label>3.3.1</label>
<title>Feature encoder</title>
<p>As shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>, we designed CoTP to learn global visual representations effectively from unlabeled CT images. Given a CT image, <inline-formula>
<mml:math id="M31">
<mml:mi>X</mml:mi>
</mml:math>
</inline-formula>, we utilized two different augmentations to create two views of the same example, <inline-formula>
<mml:math id="M32">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M33">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>. Then, we employed ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>), with the global pooling and Multilayer Perceptron (MLP) parts removed, as the feature encoder. The <inline-formula>
<mml:math id="M34">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M35">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> are mapped via encoders (<italic>q</italic>) and (<italic>k</italic>), to generate visual representations <inline-formula>
<mml:math id="M36">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M37">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, respectively. Here, <inline-formula>
<mml:math id="M38">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M39">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math id="M40">
<mml:mi>C</mml:mi>
</mml:math>
</inline-formula> are the height, width, and number of channels of the feature map, respectively. The pseudocode of CoTP is shown in <xref ref-type="sec" rid="sec13">Algorithm 1</xref>.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>The overall architecture of the proposed CoTP.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g004.tif"/>
</fig>
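<p>As a concrete illustration, the truncated ResNet50 encoder described above can be built in PyTorch roughly as follows (a sketch assuming torchvision; only the global pooling and fully connected head are stripped, so the output is the spatial feature map <italic>F</italic>).</p>
<preformat>
# Sketch of the feature encoder: ResNet50 with the global pooling and FC/MLP head
# removed, so it outputs a spatial feature map F of shape (B, C, H, W).
import torch
import torch.nn as nn
from torchvision.models import resnet50

class FeatureEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        backbone = resnet50(pretrained=False)
        # keep everything up to the last residual stage; drop avgpool and fc
        self.stem = nn.Sequential(*list(backbone.children())[:-2])

    def forward(self, x):
        return self.stem(x)    # (B, 2048, H/32, W/32) for an H x W input

encoder = FeatureEncoder()
feat = encoder(torch.randn(2, 3, 224, 224))   # torch.Size([2, 2048, 7, 7])
</preformat>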
</sec>
<sec id="sec15">
<label>3.3.2</label>
<title>Token projection</title>
<p>Traditional contrastive learning (<xref ref-type="bibr" rid="ref17">17</xref>, <xref ref-type="bibr" rid="ref19">19</xref>) typically uses a global pooling operation and an MLP as a projection head to improve the visual representations. Innovatively, we designed a token projection instead of a traditional projection head, as shown in <xref ref-type="fig" rid="fig5">Figure 5</xref>.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>The pipeline of token projection. The downsampling layer is designed to reduce computation by lowering the resolution of the feature map.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g005.tif"/>
</fig>
<p>To begin with, we reshaped the feature <inline-formula>
<mml:math id="M41">
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> to <inline-formula>
<mml:math id="M42">
<mml:mi>T</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. Then, <inline-formula>
<mml:math id="M43">
<mml:mi>T</mml:mi>
</mml:math>
</inline-formula> was passed through three different linear projections and yielded a query <inline-formula>
<mml:math id="M44">
<mml:mi>Q</mml:mi>
</mml:math>
</inline-formula>, a key <inline-formula>
<mml:math id="M45">
<mml:mi>K</mml:mi>
</mml:math>
</inline-formula>, and a value <inline-formula>
<mml:math id="M46">
<mml:mi>V</mml:mi>
</mml:math>
</inline-formula>. Furthermore, we performed average pooling with a pooling size of <inline-formula>
<mml:math id="M47">
<mml:mi>S</mml:mi>
</mml:math>
</inline-formula> for <inline-formula>
<mml:math id="M48">
<mml:mi>K</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M49">
<mml:mi>V</mml:mi>
</mml:math>
</inline-formula> to reduce the cost of computation. Here, we set <inline-formula>
<mml:math id="M50">
<mml:mi>S</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>7</mml:mn>
</mml:math>
</inline-formula>. After that, a convolution with a kernel size of 1 and a stride of 1 was utilized to fuse the features. Then, we obtained <inline-formula>
<mml:math id="M51">
<mml:mi>K</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M52">
<mml:mi>V</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> after applying layer normalization (LN) and the ReLU activation function. Afterward, we performed multi-head self-attention (MHSA) on <inline-formula>
<mml:math id="M53">
<mml:mi>Q</mml:mi>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M54">
<mml:mi>K</mml:mi>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math id="M55">
<mml:mi>V</mml:mi>
</mml:math>
</inline-formula>, as shown in <xref ref-type="disp-formula" rid="EQ2">Eq. (3)</xref>,</p>
<disp-formula id="EQ2"><label>(3)</label> <mml:math id="M56">
<mml:msup>
<mml:mi>T</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">MHSA</mml:mi>
<mml:mfenced open="(" close=")" separators=",,">
<mml:mi>Q</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">Softmax</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mfrac>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:msqrt>
<mml:mi>C</mml:mi>
</mml:msqrt>
</mml:mfrac>
</mml:mfenced>
<mml:mi>V</mml:mi>
</mml:math></disp-formula>
<p>Then, we calculated the mean score of <inline-formula>
<mml:math id="M57">
<mml:msup>
<mml:mi>T</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> along the column dimension. Finally, we performed a linear projection to eliminate redundant features and obtain <inline-formula>
<mml:math id="M58">
<mml:mi>Z</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. In particular, we set the dimension <inline-formula>
<mml:math id="M59">
<mml:mi>D</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>128</mml:mn>
</mml:math>
</inline-formula>.</p>
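<p>A minimal PyTorch sketch of the token projection is given below. It follows the steps described above (reshaping, Q/K/V projections, average pooling of K and V to S &#x00D7; S tokens, a 1 &#x00D7; 1 fusion convolution with LN and ReLU, MHSA as in Eq. (3), a column-wise mean, and a final linear layer to D = 128); the number of attention heads is an assumption, and a recent PyTorch with batch-first multi-head attention is assumed.</p>
<preformat>
# Sketch of the token projection (head count and minor details are assumptions).
import torch
import torch.nn as nn

class TokenProjection(nn.Module):
    def __init__(self, in_dim=2048, out_dim=128, pool_size=7, num_heads=8):
        super().__init__()
        self.q = nn.Linear(in_dim, in_dim)
        self.k = nn.Linear(in_dim, in_dim)
        self.v = nn.Linear(in_dim, in_dim)
        self.pool = nn.AdaptiveAvgPool2d(pool_size)   # downsample K, V to S x S tokens
        self.fuse = nn.Conv2d(in_dim, in_dim, kernel_size=1, stride=1)
        self.norm = nn.LayerNorm(in_dim)
        self.act = nn.ReLU(inplace=True)
        self.mhsa = nn.MultiheadAttention(in_dim, num_heads, batch_first=True)
        self.out = nn.Linear(in_dim, out_dim)         # eliminate redundant features, D = 128

    def forward(self, f):                             # f: (B, C, H, W) from the encoder
        b, c, h, w = f.shape
        tokens = f.flatten(2).transpose(1, 2)         # T: (B, H*W, C)
        q = self.q(tokens)

        def downsample(x):                            # pool, 1x1 conv, then LN + ReLU
            x = x.transpose(1, 2).reshape(b, c, h, w)
            x = self.fuse(self.pool(x))
            x = x.flatten(2).transpose(1, 2)          # (B, S*S, C)
            return self.act(self.norm(x))

        k, v = downsample(self.k(tokens)), downsample(self.v(tokens))
        attn, _ = self.mhsa(q, k, v)                  # MHSA, Eq. (3): (B, H*W, C)
        z = attn.mean(dim=1)                          # column-wise mean over tokens
        return self.out(z)                            # Z: (B, 128)
</preformat>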
</sec>
<sec id="sec16">
<label>3.3.3</label>
<title>Update the weights</title>
<p>To provide a large number of negative sample pairs and reduce the computing cost of the Graphics Processing Unit (GPU), a memory bank was used to store, in advance, the negative samples generated by the encoder (<italic>k</italic>). Hence, we obtained a set of encoded (<italic>k</italic>) samples <inline-formula>
<mml:math id="M60">
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x00B7;</mml:mo>
<mml:mo>&#x22EF;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>. Among all the encoded (<italic>k</italic>) samples in the set <italic>E<sub>k</sub></italic>, for each encoded query <inline-formula>
<mml:math id="M61">
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, a single positive key <inline-formula>
<mml:math id="M62">
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> was matched, while the remainder of the keys (negative keys) represented different images. A contrastive loss function is represented in <xref ref-type="disp-formula" rid="EQ3">Eq. (4)</xref> as follows, whose value is low when <inline-formula>
<mml:math id="M63">
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is close to its positive key <inline-formula>
<mml:math id="M64">
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and moves away from all other encoded (<italic>k</italic>) samples:</p>
<disp-formula id="EQ3">
<label>(4)</label>
<mml:math id="M65">
<mml:mi>L</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mo>log</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mfrac>
<mml:mrow>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mfenced>
</mml:math>
</disp-formula>
<p>where <italic>&#x03C4;</italic> is a temperature hyper-parameter (<xref ref-type="bibr" rid="ref16">16</xref>), and the number of negatives <inline-formula>
<mml:math id="M66">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> is set at 32,256. We updated the weights <inline-formula>
<mml:math id="M67">
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> of the encoder (<italic>q</italic>) and token projection (<italic>q</italic>) by back-propagation, while the weights <inline-formula>
<mml:math id="M68">
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> of the encoder (<italic>k</italic>) and token projection (<italic>k</italic>) were updated by momentum update (<xref ref-type="bibr" rid="ref18">18</xref>), as <xref ref-type="disp-formula" rid="EQ4">Eq. (5)</xref>, where <inline-formula>
<mml:math id="M69">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> is set to 0.999 to update the weights slowly.</p>
<disp-formula id="EQ4">
<label>(5)</label>
<mml:math id="M70">
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mi>&#x03C9;</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:math>
</disp-formula>
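<p>The contrastive loss of Eq. (4) and the momentum update of Eq. (5) can be written compactly in PyTorch, as in the sketch below. The memory bank is passed in as a tensor of <italic>N</italic> negative keys, the temperature value is illustrative, and the caller is responsible for zeroing gradients, stepping the optimizer, and enqueueing the returned keys.</p>
<preformat>
# Sketch of one CoTP contrastive step: InfoNCE loss (Eq. 4) and momentum update (Eq. 5).
# memory_bank: (N, D) tensor of negative keys; m = 0.999 as in the text; tau is illustrative.
import torch
import torch.nn.functional as F

def cotp_step(v_q, v_k, encoder_q, proj_q, encoder_k, proj_k, memory_bank,
              tau=0.07, m=0.999):
    z_q = F.normalize(proj_q(encoder_q(v_q)), dim=1)         # queries, trained by backprop
    with torch.no_grad():
        z_k = F.normalize(proj_k(encoder_k(v_k)), dim=1)     # keys from the momentum branch

    l_pos = (z_q * z_k).sum(dim=1, keepdim=True)             # B x 1 positive logits
    l_neg = z_q @ memory_bank.t()                            # B x N negative logits
    logits = torch.cat([l_pos, l_neg], dim=1) / tau
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)
    loss = F.cross_entropy(logits, labels)                   # equivalent to Eq. (4)
    loss.backward()

    with torch.no_grad():                                    # Eq. (5): w_k = m*w_k + (1-m)*w_q
        for p_k, p_q in zip(list(encoder_k.parameters()) + list(proj_k.parameters()),
                            list(encoder_q.parameters()) + list(proj_q.parameters())):
            p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)
    return loss.detach(), z_k                                # z_k is enqueued by the caller
</preformat>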
</sec>
</sec>
<sec id="sec17">
<label>3.4</label>
<title>Subsequent fine-tuning</title>
<p>After CoTP pre-training, we took out the feature encoder with pre-trained weights, followed by a max pooling and average pooling (MAP) head, as shown in <xref ref-type="fig" rid="fig6">Figure 6</xref>. First, we performed a <inline-formula>
<mml:math id="M71">
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula> convolution and reshaped the feature <inline-formula>
<mml:math id="M72">
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> to <inline-formula>
<mml:math id="M73">
<mml:mi>T</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. Here, <italic>C</italic> denotes the number of categories. Afterward, we calculated the mean score and the maximum score of <italic>T</italic> along the column dimension, respectively. Finally, a hyper-parameter <inline-formula>
<mml:math id="M74">
<mml:mi mathvariant="normal">&#x03BB;</mml:mi>
</mml:math>
</inline-formula> was employed to combine the mean score <inline-formula>
<mml:math id="M75">
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and maximum score <inline-formula>
<mml:math id="M76">
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, as <xref ref-type="disp-formula" rid="EQ5">Eq. (6)</xref>.</p>
<disp-formula id="EQ5">
<label>(6)</label>
<mml:math id="M77">
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="normal">&#x03BB;</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:math>
</disp-formula>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>The backbone of the subsequent fine-tuning. We leverage max pooling head and average pooling head to improve classification performance.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g006.tif"/>
</fig>
<p>It is noteworthy that max pooling can be regarded as class-specific attention that attends to the different spatial regions occupied by objects of different categories. We then used a simple cross-entropy loss to fine-tune the model with a few labeled CT images.</p>
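<p>A minimal sketch of the MAP head described above and in <xref ref-type="disp-formula" rid="EQ5">Eq. (6)</xref> is given below; the default value of the weighting hyper-parameter follows Section 5.4, while the class and variable names are illustrative assumptions rather than the released code.</p>
<preformat>
import torch
import torch.nn as nn

class MAPHead(nn.Module):
    """Max-and-average-pooling head: S = S_a + lambda * S_m, see Eq. (6)."""
    def __init__(self, in_channels, num_classes, lam=0.02):
        super().__init__()
        self.conv1x1 = nn.Conv2d(in_channels, num_classes, kernel_size=1)
        self.lam = lam

    def forward(self, feat):            # feat: (B, C_in, H, W) encoder feature map
        t = self.conv1x1(feat)          # (B, num_classes, H, W)
        t = t.flatten(2)                # reshape to (B, num_classes, H*W)
        s_a = t.mean(dim=2)             # mean score per class
        s_m = t.amax(dim=2)             # maximum score per class
        return s_a + self.lam * s_m     # combined class scores S

# usage with a ResNet50-like feature map
head = MAPHead(in_channels=2048, num_classes=2)
scores = head(torch.randn(4, 2048, 7, 7))   # (4, 2) class scores
</preformat>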
</sec>
<sec id="sec18">
<label>3.5</label>
<title>Implementation details</title>
<p>As shown in <xref ref-type="table" rid="tab4">Table 3</xref>, we utilized Python 3.7 and PyTorch 1.7.0 with PyCharm as our Integrated Development Environment (IDE), running on a PC equipped with an Intel(R) i9-10940X CPU and 4 Nvidia 1080 Ti GPUs with 48&#x2009;GB memory. At the CoTP pre-training stage, Stochastic Gradient Descent (SGD) was employed as the optimizer, with a weight decay of 1e-4 and a momentum of 0.9. The mini-batch size (the number of examples processed together in one training iteration) was set to 128, and the learning rate was initialized to 0.03. Following He et al. (<xref ref-type="bibr" rid="ref18">18</xref>), we trained for a total of 200 epochs and multiplied the learning rate by 0.1 at epochs 120 and 160.</p>
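<p>The pre-training optimizer and learning-rate schedule described above can be written, for instance, as the following sketch; the 128-dimensional ResNet50 is an illustrative stand-in for the CoTP query branch.</p>
<preformat>
import torch
import torchvision

model = torchvision.models.resnet50(num_classes=128)   # illustrative query encoder
optimizer = torch.optim.SGD(model.parameters(), lr=0.03,
                            momentum=0.9, weight_decay=1e-4)
# multiply the learning rate by 0.1 at epochs 120 and 160 over 200 epochs in total
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[120, 160], gamma=0.1)
</preformat>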
<table-wrap position="float" id="tab4">
<label>Table 3</label>
<caption>
<p>The working environment.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="center" valign="top" colspan="2">Hardware</th>
<th align="left" valign="top" colspan="2">Software</th>
</tr>
<tr>
<th align="left" valign="top">CPU</th>
<th align="left" valign="top">GPU</th>
<th align="left" valign="top">IDE</th>
<th align="left" valign="top">Framework</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Intel(R) i9-10940X</td>
<td align="left" valign="middle">Nvidia 1,080 Ti (Numbers: 4)</td>
<td align="left" valign="middle">PyCharm</td>
<td align="left" valign="middle">Pytorch 1.7.0</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>At the subsequent fine-tuning stage, we utilized AdamW with a weight decay of 1e-3 as the optimizer. The mini-batch size was set to 32, and the learning rate was initialized to 1e-4.</p>
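<p>The corresponding fine-tuning configuration can be sketched as follows; the two-class backbone is an illustrative placeholder for the CoTP pre-trained encoder with the MAP head.</p>
<preformat>
import torch
import torchvision

model = torchvision.models.resnet50(num_classes=2)   # placeholder fine-tuning model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-3)
# the mini-batch size of 32 is set on the data loader, e.g.
# loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
</preformat>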
</sec>
</sec>
<sec sec-type="results" id="sec19">
<label>4</label>
<title>Results</title>
<p>The classification performance of the proposed methods was evaluated in terms of the standard metrics accuracy (ACC), sensitivity (SEN), and precision (PRE), defined in <xref ref-type="disp-formula" rid="EQ6">Eq. (7)</xref>&#x2013;<xref ref-type="disp-formula" rid="EQ8">(9)</xref>, where <italic>P</italic>, <italic>N</italic>, <italic>TP</italic>, <italic>TN</italic>, and <italic>FP</italic> denote positives, negatives, true positives, true negatives, and false positives, respectively.</p>
<disp-formula id="EQ6">
<label>(7)</label>
<mml:math id="M78">
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula>
<disp-formula id="EQ7">
<label>(8)</label>
<mml:math id="M79">
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mi>P</mml:mi>
</mml:mfrac>
</mml:math>
</disp-formula>
<disp-formula id="EQ8"><label>(9)</label><mml:math id="M80">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math></disp-formula>
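<p>The three metrics can be computed directly from the confusion-matrix counts, as in the short sketch below; the example counts are arbitrary.</p>
<preformat>
def classification_metrics(tp, tn, fp, fn):
    p, n = tp + fn, tn + fp          # total positives and negatives
    acc = (tp + tn) / (p + n)        # Eq. (7)
    sen = tp / p                     # Eq. (8)
    pre = tp / (tp + fp)             # Eq. (9)
    return acc, sen, pre

# arbitrary example: 500 positives and 500 negatives
print(classification_metrics(tp=460, tn=470, fp=30, fn=40))
</preformat>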
<p>In addition, the mean AUC (<xref ref-type="bibr" rid="ref39">39</xref>) was employed to evaluate the ability of the model to discriminate between different classes. Furthermore, we also used a non-parametric bootstrap (<xref ref-type="bibr" rid="ref40">40</xref>) to estimate the variability around model performance. We performed a total of 500 bootstrap sampling with 300 cases from the test set.</p>
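<p>The bootstrap procedure can be sketched as follows; y_true and y_score stand for the test labels and model scores, and the random arrays in the example are placeholders for illustration only.</p>
<preformat>
import numpy as np
from sklearn.metrics import roc_auc_score

def bootstrap_auc(y_true, y_score, n_boot=500, n_cases=300, seed=0):
    rng = np.random.default_rng(seed)
    aucs = []
    for _ in range(n_boot):
        idx = rng.choice(len(y_true), size=n_cases, replace=True)
        if len(np.unique(y_true[idx])) == 2:     # skip resamples with a single class
            aucs.append(roc_auc_score(y_true[idx], y_score[idx]))
    aucs = np.array(aucs)
    return aucs.mean(), np.percentile(aucs, [2.5, 97.5])   # mean AUC and 95% CI

# placeholder labels/scores, sized like the 536-image Omicron test set
y_true = np.random.randint(0, 2, size=536)
y_score = np.random.rand(536)
print(bootstrap_auc(y_true, y_score))
</preformat>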
<sec id="sec20">
<label>4.1</label>
<title>Datasets</title>
<p>The study was approved by Zhongshan Hospital, Fudan University in Shanghai, China. All chest CT images in the Omicron dataset were selected from a retrospective cohort of adult Omicron patients hospitalized in the Shanghai Geriatrics Center from March to July 2022. Chest CT examination was performed as part of the patients&#x2019; routine clinical care at the time of admission. The eligibility criteria were as follows: (1) having intact basic information to be retrieved (name, gender, age, diagnosis, and severity), and (2) having a CT scan on admission. Patients with underlying lung diseases such as chronic obstructive pulmonary disease (COPD) and bronchiectasis were excluded. All patient scans were downloaded in the DICOM image format. The CT slice thickness was 5&#x2009;mm.</p>
<p>The diagnosis and classification of severity were based on the Diagnosis and Treatment Scheme of Pneumonia Caused by Novel Coronavirus of China (the ninth version). (National Health Commission of China. The guidelines for the diagnosis and treatment of new coronavirus pneumonia (version 9). Accessed July 25, 2023 <ext-link xlink:href="https://www.gov.cn/xinwen/2022-06/28/content_5698168.htm" ext-link-type="uri">https://www.gov.cn/xinwen/2022-06/28/content_5698168.htm</ext-link>). Adults were considered severe Omicron pneumonia if they met any of the following criteria: (1) tachypnea with a respiratory rate&#x2009;&#x2265;&#x2009;30 breaths/min; (2) oxygen saturation (at rest)&#x2009;&#x2264;&#x2009;93%; (3) PaO2/FiO2&#x2009;&#x2264;&#x2009;300&#x2009;mmHg; (4) radiographic progression of more than 50% of the lesion over 24&#x2013;48&#x2009;h; or (5) respiratory failure, shock, or other organ failures.</p>
<p>Following the above standards, we retrospectively collected high-resolution CT images of 73 patients with mild Omicron pneumonia and 56 patients with severe Omicron pneumonia. The Omicron dataset and demographic characteristics of patients are detailed in <xref ref-type="table" rid="tab5">Table 4</xref>. Initially, we converted and exported the DICOM files of Omicron patients into JPEG formats with 1,500 HU window width and 750 HU window level. After that, we obtained 50,500 unlabeled CT images with the size of <inline-formula>
<mml:math id="M81">
<mml:mn>224</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>224</mml:mn>
</mml:math>
</inline-formula> for CoTP pre-training.</p>
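<p>The DICOM-to-JPEG conversion with the stated window settings (1,500&#x2009;HU width, 750&#x2009;HU level) can be sketched with pydicom and Pillow; the file paths are placeholders, and the rescale handling is a common assumption rather than the exact export pipeline used here.</p>
<preformat>
import numpy as np
import pydicom
from PIL import Image

def dicom_to_jpeg(dcm_path, jpg_path, width=1500, level=750):
    ds = pydicom.dcmread(dcm_path)
    slope = float(getattr(ds, "RescaleSlope", 1.0))
    intercept = float(getattr(ds, "RescaleIntercept", 0.0))
    hu = ds.pixel_array * slope + intercept                 # convert to Hounsfield units
    lo, hi = level - width / 2.0, level + width / 2.0       # window bounds in HU
    img = np.clip((hu - lo) / (hi - lo), 0.0, 1.0)          # map the window to [0, 1]
    Image.fromarray((img * 255).astype(np.uint8)).resize((224, 224)).save(jpg_path)

# dicom_to_jpeg("slice_0001.dcm", "slice_0001.jpg")         # placeholder paths
</preformat>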
<table-wrap position="float" id="tab5">
<label>Table 4</label>
<caption>
<p>Demographics and baseline characteristics of patients in the Omicron dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top" colspan="2">Age</th>
<th align="center" valign="top" colspan="2">Gender</th>
</tr>
<tr>
<th/>
<th align="center" valign="top">&#x003C; 60&#x2009;years (16&#x2013;58)</th>
<th align="center" valign="top">&#x2265; 60&#x2009;years (60&#x2013;96)</th>
<th align="center" valign="top">Male</th>
<th align="center" valign="top">Female</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Mild Omicron pneumonia</td>
<td align="center" valign="middle">28</td>
<td align="center" valign="middle">45</td>
<td align="center" valign="middle">38</td>
<td align="center" valign="middle">35</td>
</tr>
<tr>
<td align="left" valign="top">Severe Omicron pneumonia</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">53</td>
<td align="center" valign="middle">34</td>
<td align="center" valign="middle">22</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Two experienced radiologists first labeled 2,742 CT images from the Omicron dataset, and the remaining data were used for CoTP pre-training; the labeled CT images were excluded from pre-training. The distribution of the training and testing sets in the Omicron dataset is shown in <xref ref-type="table" rid="tab6">Table 5</xref>. In addition, we utilized the external SARS-CoV-2 CT-scan dataset presented by Soares et al. (<xref ref-type="bibr" rid="ref21">21</xref>) to evaluate the transferability of CoTP. As shown in <xref ref-type="fig" rid="fig7">Figure 7</xref>, 1,252 CT scans were positive for SARS-CoV-2 infection (COVID-19), while 1,229 CT scans were from patients not infected by SARS-CoV-2.</p>
<table-wrap position="float" id="tab6">
<label>Table 5</label>
<caption>
<p>Distribution of training and testing set in Omicron dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top">Mild Omicron pneumonia (n images)</th>
<th align="center" valign="top">Severe Omicron pneumonia (n images)</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Pre-training</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">129 (50, 500)</td>
</tr>
<tr>
<td align="left" valign="top">Training set</td>
<td align="center" valign="top">58 (1302)</td>
<td align="center" valign="top">45 (904)</td>
<td align="center" valign="top">103 (2206)</td>
</tr>
<tr>
<td align="left" valign="top">Testing set</td>
<td align="center" valign="top">15 (330)</td>
<td align="center" valign="top">11 (206)</td>
<td align="center" valign="top">26 (536)</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Distribution of training and testing set in SARS-CoV-2 CT-scan dataset.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g007.tif"/>
</fig>
</sec>
<sec id="sec21">
<label>4.2</label>
<title>Transfer performance of CoTP representations for omicron pneumonia diagnosis</title>
<p>To assess the effectiveness of the visual representations extracted by CoTP, we employed VGG16 (<xref ref-type="bibr" rid="ref41">41</xref>), DenseNet121 (<xref ref-type="bibr" rid="ref42">42</xref>), and ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>) as our backbones and selected six types of pre-training methods for comparison. As depicted in <xref ref-type="table" rid="tab7">Table 6</xref>, the weights of the non-pre-training method were randomly initialized, while the supervised pre-training method underwent pre-training on ImageNet-1k (<xref ref-type="bibr" rid="ref43">43</xref>). In addition, we presented a comprehensive comparison with existing contrastive methods, namely SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>), MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>), SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>), and MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>), to prove the effectiveness of the proposed CoTP method. We evaluated the classification performance of the model using the mean AUC, accuracy (ACC), sensitivity (SEN), and precision (PRE) of each infection type. Based on <xref ref-type="table" rid="tab7">Table 6</xref> and <xref ref-type="fig" rid="fig8">Figure 8</xref>, we made the following observations: (1) The pre-training method plays an important role in improving model performance: ResNet50 with supervised pre-training on ImageNet-1&#x2009;k achieves 8.02% higher ACC, 10.43% higher SEN, and 9.71% higher PRE than the same backbone without pre-training. (2) Our CoTP pre-training method outperforms the supervised method and the contrastive learning methods, reaching 83.54, 91.32, and 92.35% ACC with VGG16, DenseNet121, and ResNet50, respectively. (3) With ResNet50, our CoTP achieves 8.07, 7.37, 4.11, and 2.56% higher AUC than SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>), MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>), SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>), and MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>), respectively.</p>
<table-wrap position="float" id="tab7">
<label>Table 6</label>
<caption>
<p>The transfer results of Omicron pneumonia diagnosis.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Architectures</th>
<th align="center" valign="top" colspan="2">Pre-training</th>
<th align="center" valign="top" rowspan="2">ACC (%)</th>
<th align="center" valign="top" rowspan="2">SEN (%)</th>
<th align="center" valign="top" rowspan="2">PRE(%)</th>
</tr>
<tr>
<th align="left" valign="top">Method</th>
<th align="left" valign="top">Dataset</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle" rowspan="7">VGG16 (<xref ref-type="bibr" rid="ref41">41</xref>)</td>
<td align="left" valign="middle">None</td>
<td align="left" valign="middle">None</td>
<td align="center" valign="middle">73.28</td>
<td align="center" valign="middle">73.39</td>
<td align="center" valign="middle">72.18</td>
</tr>
<tr>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">80.19</td>
<td align="center" valign="middle">80.66</td>
<td align="center" valign="middle">79.83</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">79.79</td>
<td align="center" valign="middle">79.16</td>
<td align="center" valign="middle">78.75</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">79.82</td>
<td align="center" valign="middle">79.24</td>
<td align="center" valign="middle">78.93</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">80.24</td>
<td align="center" valign="middle">80.98</td>
<td align="center" valign="middle">80.28</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">81.06</td>
<td align="center" valign="middle">81.27</td>
<td align="center" valign="middle">82.56</td>
</tr>
<tr>
<td align="left" valign="middle">CoTP</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">
<bold>83.54</bold>
</td>
<td align="center" valign="middle">
<bold>86.02</bold>
</td>
<td align="center" valign="middle">
<bold>84.13</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="7">DenseNet121 (<xref ref-type="bibr" rid="ref42">42</xref>)</td>
<td align="left" valign="middle">None</td>
<td align="left" valign="middle">None</td>
<td align="center" valign="middle">76.73</td>
<td align="center" valign="middle">76.92</td>
<td align="center" valign="middle">76.77</td>
</tr>
<tr>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">88.06</td>
<td align="center" valign="middle">88.80</td>
<td align="center" valign="middle">88.14</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">86.89</td>
<td align="center" valign="middle">87.49</td>
<td align="center" valign="middle">87.20</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">87.18</td>
<td align="center" valign="middle">87.68</td>
<td align="center" valign="middle">87.33</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">88.24</td>
<td align="center" valign="middle">80.98</td>
<td align="center" valign="middle">80.28</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">88.30</td>
<td align="center" valign="middle">88.97</td>
<td align="center" valign="middle">88.36</td>
</tr>
<tr>
<td align="left" valign="middle">CoTP</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">
<bold>91.32</bold>
</td>
<td align="center" valign="middle">
<bold>92.01</bold>
</td>
<td align="center" valign="middle">
<bold>90.92</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="7">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>)</td>
<td align="left" valign="middle">None</td>
<td align="left" valign="middle">None</td>
<td align="center" valign="middle">77.61</td>
<td align="center" valign="middle">77.90</td>
<td align="center" valign="middle">76.69</td>
</tr>
<tr>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">85.63</td>
<td align="center" valign="middle">88.33</td>
<td align="center" valign="middle">86.40</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">84.28</td>
<td align="center" valign="middle">87.49</td>
<td align="center" valign="middle">85.17</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">84.98</td>
<td align="center" valign="middle">87.73</td>
<td align="center" valign="middle">85.67</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">88.24</td>
<td align="center" valign="middle">88.98</td>
<td align="center" valign="middle">87.28</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">89.79</td>
<td align="center" valign="middle">89.51</td>
<td align="center" valign="middle">88.69</td>
</tr>
<tr>
<td align="left" valign="middle">CoTP</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">
<bold>92.35</bold>
</td>
<td align="center" valign="middle">
<bold>92.96</bold>
</td>
<td align="center" valign="middle">
<bold>91.54</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The highest scores are shown in boldface.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>ROC curves of the four types of pre-training methods using ResNet50 on the Omicron dataset. Our CoTP achieves the highest 98.90% AUC.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g008.tif"/>
</fig>
<p>As shown in <xref ref-type="fig" rid="fig9">Figure 9</xref>, we measured the dispersion over the test set using a box plot and performed statistical significance testing using a paired t-test. From this, we observed that CoTP significantly outperforms the non-pre-training method (<italic>p</italic>-value &#x003C; 1e-5) and the supervised method (<italic>p</italic>-value &#x003C; 1e-5), while performing slightly better than the contrastive learning method MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>) (<italic>p</italic>-value &#x003C; 1e-4). Moreover, our CoTP method showed better robustness than the other types of pre-training methods, since its distribution of AUCs was more concentrated.</p>
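<p>The paired t-test over the bootstrap AUC samples can be carried out, for example, as in the sketch below; the arrays here are synthetic placeholders, not the reported AUC values.</p>
<preformat>
import numpy as np
from scipy import stats

# per-resample AUCs of two methods over the same 500 bootstrap samples (placeholders)
rng = np.random.default_rng(0)
auc_cotp = 0.98 + 0.005 * rng.standard_normal(500)
auc_moco = 0.97 + 0.005 * rng.standard_normal(500)
t_stat, p_value = stats.ttest_rel(auc_cotp, auc_moco)   # paired t-test
print(t_stat, p_value)
</preformat>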
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Box plot of AUC values produced by different pre-training methods using ResNet50 based on the Omicron dataset. We use 500 bootstrap samples with 300 cases to calculate the AUC which can evaluate the robustness of the model.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g009.tif"/>
</fig>
</sec>
<sec id="sec22">
<label>4.3</label>
<title>Transfer benefit of CoTP pre-training on an external SARS-CoV-2 CT-scan dataset</title>
<p>We conducted experiments to test whether the CoTP pre-trained chest CT representations acquired from a source dataset (the Omicron dataset) transfer to an external dataset, the SARS-CoV-2 CT-scan dataset. <xref ref-type="table" rid="tab9">Table 8</xref> reports the classification results of previous methods (<xref ref-type="bibr" rid="ref27">27</xref>, <xref ref-type="bibr" rid="ref28">28</xref>, <xref ref-type="bibr" rid="ref44 ref45 ref46 ref47 ref48 ref49 ref50">44&#x2013;50</xref>) and six types of pre-training methods based on ResNet50, while the confusion matrices for four of them are shown in <xref ref-type="fig" rid="fig10">Figure 10</xref>. Based on these experimental results, we can draw the following conclusions: (1) Visual representations learned from CoTP transfer better to downstream tasks than those from ImageNet pre-training. (2) By taking advantage of our CoTP pre-training, our model outperforms all the other contrastive methods on all metrics by a large margin in discriminating between COVID and non-COVID CT images. For example, the ACC score of CoTP is 7.25 and 1.01% higher than that of the non-pre-training and MoCo-v2 pre-training methods, respectively.</p>
<table-wrap position="float" id="tab8">
<label>Table 7</label>
<caption>
<p>Statistical significance testing for age and gender in the Omicron dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top">Total</th>
<th align="center" valign="top">Mild</th>
<th align="center" valign="top">Severe</th>
<th align="center" valign="top">
<inline-formula>
<mml:math id="M82">
<mml:msup>
<mml:mi>&#x03C7;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:math>
</inline-formula>
</th>
<th align="center" valign="top">
<italic>p</italic>
</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Age&#x2009;&#x003C;&#x2009;60</td>
<td align="center" valign="top">31</td>
<td align="center" valign="top">28 (90.3%)</td>
<td align="center" valign="top">3 (9.7%)</td>
<td align="center" valign="middle" rowspan="2">18.902</td>
<td align="center" valign="middle" rowspan="2">&#x003C; le-3</td>
</tr>
<tr>
<td align="left" valign="top">Age&#x2009;&#x2265;&#x2009;60</td>
<td align="center" valign="top">98</td>
<td align="center" valign="top">45 (45.9%)</td>
<td align="center" valign="top">53 (54.1%)</td>
</tr>
<tr>
<td align="left" valign="top">Male</td>
<td align="center" valign="top">72</td>
<td align="center" valign="top">38 (52.8%)</td>
<td align="center" valign="top">34 (47.2%)</td>
<td align="center" valign="middle" rowspan="2">0.331</td>
<td align="center" valign="middle" rowspan="2">0.565</td>
</tr>
<tr>
<td align="left" valign="top">Female</td>
<td align="center" valign="top">57</td>
<td align="center" valign="top">35 (61.4%)</td>
<td align="center" valign="top">22 (38.6%)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The chi-square test is used to calculate the statistical significance of age and gender.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab9">
<label>Table 8</label>
<caption>
<p>The performance of CoTP and other pre-training methods on the SARS-CoV-2 CT-scan dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Architectures</th>
<th align="center" valign="top" colspan="2">Pre-training</th>
<th align="center" valign="top" rowspan="2">ACC (%)</th>
<th align="center" valign="top" rowspan="2">AUC (%)</th>
</tr>
<tr>
<th align="left" valign="top">Method</th>
<th align="left" valign="top">Dataset</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Pramod et al. (<xref ref-type="bibr" rid="ref44">44</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">85.5</td>
<td align="center" valign="middle">96.6</td>
</tr>
<tr>
<td align="left" valign="middle">Even et al. (<xref ref-type="bibr" rid="ref45">45</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">86.6</td>
<td align="center" valign="middle">86.09</td>
</tr>
<tr>
<td align="left" valign="middle">Yang et al. (<xref ref-type="bibr" rid="ref46">46</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">89</td>
<td align="center" valign="middle">-</td>
</tr>
<tr>
<td align="left" valign="middle">Ahmed et al. (<xref ref-type="bibr" rid="ref47">47</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">90.8</td>
<td align="center" valign="middle">90</td>
</tr>
<tr>
<td align="left" valign="middle">Pradeep et al. (<xref ref-type="bibr" rid="ref48">48</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">-</td>
<td align="center" valign="middle">98</td>
</tr>
<tr>
<td align="left" valign="middle">Wang et al. (<xref ref-type="bibr" rid="ref49">49</xref>)</td>
<td align="left" valign="middle">Contrastive</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">90.83</td>
<td align="center" valign="middle">96.24</td>
</tr>
<tr>
<td align="left" valign="middle">Patel et al. (<xref ref-type="bibr" rid="ref50">50</xref>)</td>
<td align="left" valign="middle">Wavelet transform</td>
<td align="left" valign="middle">None</td>
<td align="center" valign="middle">93.4</td>
<td align="center" valign="middle">93.62</td>
</tr>
<tr>
<td align="left" valign="middle">Harsh et al. (<xref ref-type="bibr" rid="ref27">27</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">95</td>
<td align="center" valign="middle">95</td>
</tr>
<tr>
<td align="left" valign="middle">Ma et al. (<xref ref-type="bibr" rid="ref28">28</xref>)</td>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">95.16</td>
<td align="center" valign="middle">99.01</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="7">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>)</td>
<td align="left" valign="middle">None</td>
<td align="left" valign="middle">None</td>
<td align="center" valign="middle">89.13</td>
<td align="center" valign="middle">95.71 (95.65&#x2013;95.78) &#x002A;</td>
</tr>
<tr>
<td align="left" valign="middle">supervised</td>
<td align="left" valign="middle">ImageNet-1&#x2009;K</td>
<td align="center" valign="middle">94.57</td>
<td align="center" valign="middle">98.81 (98.78&#x2013;98.84) &#x002A;</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">93.78</td>
<td align="center" valign="middle">97.82 (97.61&#x2013;98.03) &#x002A;</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">94.20</td>
<td align="center" valign="middle">98.56 (98.42&#x2013;98.70) &#x002A;</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv2 (<xref ref-type="bibr" rid="ref17">17</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">95.06</td>
<td align="center" valign="middle">98.96 (98.80&#x2013;99.02) &#x002A;</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>)</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">95.37</td>
<td align="center" valign="middle">99.26 (98.75&#x2013;99.29) &#x002A;</td>
</tr>
<tr>
<td align="left" valign="middle">CoTP</td>
<td align="left" valign="middle">Omicron</td>
<td align="center" valign="middle">
<bold>96.58</bold>
</td>
<td align="center" valign="middle">
<bold>99.79 (99.78&#x2013;99.80) &#x002A;</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The highest scores are shown in boldface.</p>
<p>&#x002A;Quantitative data were presented as values (95% confidence interval).</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="fig10">
<label>Figure 10</label>
<caption>
<p>Comparison of the performance of four types of pre-training methods in identifying COVID-19 pneumonia from CT images.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g010.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="sec23">
<label>5</label>
<title>Discussion</title>
<p>Recently, contrastive learning methods have achieved satisfactory results on natural image classification tasks by leveraging unlabeled data to generate a pre-trained model. However, the existing contrastive mechanisms leave room for improvement in Omicron pneumonia diagnosis from chest CT images due to their inability to mine global features and the lack of appropriate augmentations for chest CT images. Therefore, we proposed a novel contrastive learning method with token projection, namely CoTP, to improve the global visual representation. Furthermore, we leveraged a new data augmentation approach, random Poisson noise perturbation (PNP), to more realistically simulate the noise in CT images. In this section, we designed comprehensive ablation studies to assess the effectiveness of each component in the CoTP network.</p>
<sec id="sec24">
<label>5.1</label>
<title>Statistical significance testing for baseline characteristics of patients</title>
<p>First, we utilized the chi-square test to assess the statistical significance of the baseline characteristics of patients, including age and gender. Based on <xref ref-type="table" rid="tab8">Table 7</xref>, there is no significant difference in gender (<italic>p</italic>&#x2009;=&#x2009;0.565), while age shows statistical significance (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). This indicates that more attention should be paid to elderly patients, since they are more vulnerable to severe Omicron pneumonia.</p>
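<p>The chi-square test used here can be run directly on the contingency counts from <xref ref-type="table" rid="tab8">Table 7</xref>, for example with SciPy; the uncorrected Pearson statistic is shown, and applying Yates&#x2019; continuity correction changes the exact value for a 2&#x2009;&#x00D7;&#x2009;2 table.</p>
<preformat>
from scipy.stats import chi2_contingency

# rows: age below 60 / age 60 and above; columns: mild / severe (Table 7)
age_table = [[28, 3], [45, 53]]
# rows: male / female; columns: mild / severe (Table 7)
gender_table = [[38, 34], [35, 22]]

for name, table in [("age", age_table), ("gender", gender_table)]:
    chi2, p, dof, expected = chi2_contingency(table, correction=False)
    print(name, round(chi2, 3), round(p, 4))
</preformat>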
</sec>
<sec id="sec25">
<label>5.2</label>
<title>Effects of using different encoders on CoTP performance</title>
<p>Then, we leveraged several encoders, VGG16 (<xref ref-type="bibr" rid="ref41">41</xref>), ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>), DenseNet121 (<xref ref-type="bibr" rid="ref42">42</xref>), and Swin-B (<xref ref-type="bibr" rid="ref51">51</xref>), for performance comparison, as shown in <xref ref-type="fig" rid="fig11">Figure 11</xref>. The results indicate that both convolutional neural network (CNN)-based encoders and transformer-based encoders exhibit higher AUC and better robustness after pre-training with CoTP. Moreover, there are significant differences between the pre-training methods (e.g., <italic>p</italic>&#x2009;&#x2264;&#x2009;1e-5 between the supervised pre-training method and CoTP when using ResNet50 as the backbone).</p>
<fig position="float" id="fig11">
<label>Figure 11</label>
<caption>
<p>Box plot of AUC values produced by different encoders and types of pre-training methods based on the Omicron dataset. We use 500 bootstrap samples with 300 cases to calculate the AUC which can evaluate the robustness of the model.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g011.tif"/>
</fig>
</sec>
<sec id="sec26">
<label>5.3</label>
<title>Impact of the random Poisson noise perturbation on CoTP performance</title>
<p>In addition, we investigated the impact of the proposed random Poisson noise perturbation (PNP) during the data augmentation process. We compared the model performance with and without PNP, and also against random Gaussian noise perturbation (GNP). From <xref ref-type="table" rid="tab10">Table 9</xref>, we found that PNP improves performance: for example, VGG16 (<xref ref-type="bibr" rid="ref41">41</xref>) with PNP achieves 0.79% higher accuracy, 0.84% higher sensitivity, and 0.88% higher precision than without PNP. In contrast, GNP does not significantly improve model performance. PNP simulates the noise in CT images, which improves the generalization of the model.</p>
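<p>A minimal sketch of a random Poisson noise perturbation is given below; the intensity parameter (peak) and the application probability are illustrative assumptions, not the exact settings of the proposed PNP.</p>
<preformat>
import numpy as np
from PIL import Image

def random_poisson_noise(img, peak=30.0, p=0.5, rng=None):
    """With probability p, replace pixel values by Poisson-distributed counts."""
    rng = np.random.default_rng() if rng is None else rng
    if rng.random() > p:
        return img
    arr = np.asarray(img, dtype=np.float32) / 255.0
    noisy = rng.poisson(arr * peak) / peak          # signal-dependent (Poisson) noise
    arr8 = np.clip(noisy * 255.0, 0.0, 255.0).astype(np.uint8)
    return Image.fromarray(arr8)

# apply to a uniform gray 224 x 224 test image
img = Image.fromarray(np.full((224, 224), 128, dtype=np.uint8))
noisy = random_poisson_noise(img, p=1.0)
</preformat>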
<table-wrap position="float" id="tab10">
<label>Table 9</label>
<caption>
<p>Impact of the random Poisson noise perturbation on model performance based on the Omicron dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th/>
<th align="center" valign="top" colspan="2">VGG16 (<xref ref-type="bibr" rid="ref41">41</xref>)</th>
<th/>
<th align="center" valign="top" colspan="2">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>)</th>
<th/>
<th align="center" valign="top" colspan="2">DenseNet121 (<xref ref-type="bibr" rid="ref42">42</xref>)</th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td align="center" valign="middle">ACC</td>
<td align="center" valign="middle">SEN</td>
<td align="center" valign="middle">PRE</td>
<td align="center" valign="middle">ACC</td>
<td align="center" valign="middle">SEN</td>
<td align="center" valign="middle">PRE</td>
<td align="center" valign="middle">ACC</td>
<td align="center" valign="middle">SEN</td>
<td align="center" valign="middle">PRE</td>
</tr>
<tr>
<td align="left" valign="middle">w/o PNP</td>
<td align="center" valign="middle">82.79</td>
<td align="center" valign="middle">85.18</td>
<td align="center" valign="middle">83.29</td>
<td align="center" valign="middle">91.27</td>
<td align="center" valign="middle">92.23</td>
<td align="center" valign="middle">90.95</td>
<td align="center" valign="middle">90.78</td>
<td align="center" valign="middle">91.22</td>
<td align="center" valign="middle">90.09</td>
</tr>
<tr>
<td align="left" valign="middle">GNP</td>
<td align="center" valign="middle">82.84</td>
<td align="center" valign="middle">85.12</td>
<td align="center" valign="middle">83.22</td>
<td align="center" valign="middle">91.36</td>
<td align="center" valign="middle">92.10</td>
<td align="center" valign="middle">90.62</td>
<td align="center" valign="middle">90.74</td>
<td align="center" valign="middle">91.13</td>
<td align="center" valign="middle">89.92</td>
</tr>
<tr>
<td align="left" valign="middle">PNP</td>
<td align="center" valign="middle">
<bold>83.58</bold>
</td>
<td align="center" valign="middle">
<bold>86.02</bold>
</td>
<td align="center" valign="middle">
<bold>84.17</bold>
</td>
<td align="center" valign="top">
<bold>92.35</bold>
</td>
<td align="center" valign="top">
<bold>92.96</bold>
</td>
<td align="center" valign="top">
<bold>91.54</bold>
</td>
<td align="center" valign="middle">
<bold>91.36</bold>
</td>
<td align="center" valign="middle">
<bold>92.09</bold>
</td>
<td align="center" valign="middle">
<bold>90.96</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The highest scores in each model are shown in boldface.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec27">
<label>5.4</label>
<title>Impact of the MAP head on subsequent fine-tuning performance</title>
<p>To evaluate the effect of the MAP head on subsequent fine-tuning performance, the traditional classification (TC) head, which typically consists of a global pooling operation and a fully connected layer, was used for comparative experiments on the SARS-CoV-2 CT-scan dataset. Here, we used ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>) pre-trained by our CoTP as the backbone. Based on <xref ref-type="fig" rid="fig12">Figure 12</xref>, we found that the proposed MAP head outperforms the TC head and achieves the best overall performance with <inline-formula>
<mml:math id="M83">
<mml:mi mathvariant="normal">&#x03BB;</mml:mi>
</mml:math>
</inline-formula>= 0.02.</p>
<fig position="float" id="fig12">
<label>Figure 12</label>
<caption>
<p>Effect of the MAP head on subsequent fine-tuning performance.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g012.tif"/>
</fig>
</sec>
<sec id="sec28">
<label>5.5</label>
<title>Impact of the training data size</title>
<p>To study the transferability of the model under limited labels during the fine-tuning phase, we experimented with 10, 25, 50, 75, and 100% of the training data on the SARS-CoV-2 CT-scan dataset. As shown in <xref ref-type="fig" rid="fig13">Figure 13</xref>, we compared three types of pre-training methods based on ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>). The weights of the non-pre-training method were randomly initialized, and the supervised pre-training method was pre-trained on ImageNet. From the results, ACC improves, as expected, as more labeled data become available in the fine-tuning phase. Moreover, it is promising to observe that even with 50% of the training data, CoTP asymptotically approaches the fully fine-tuned (100% training data) setup.</p>
<fig position="float" id="fig13">
<label>Figure 13</label>
<caption>
<p>Effect of training data size on model performance.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g013.tif"/>
</fig>
</sec>
<sec id="sec29">
<label>5.6</label>
<title>Visualization of grad-CAM heat map</title>
<p>Finally, we illustrate Grad-CAM (<xref ref-type="bibr" rid="ref52">52</xref>) visualizations of the features learned by different pre-training methods based on ResNet50 in <xref ref-type="fig" rid="fig14">Figure 14</xref>. Higher responses are highlighted in red, while lower ones are shown in blue. The expert annotation of the infected regions is indicated by a red dotted circle. As can be seen, the heatmaps generated by the non-pre-training method are fuzzy and blurred, and the heatmaps yielded by the supervised pre-training method focus on the edge areas of the CT images. On the contrary, our CoTP learned features that focus on the infection region, which can improve classification accuracy in comparison with the other approaches.</p>
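<p>Heat maps of this kind can be produced with forward and backward hooks on the last convolutional block; the sketch below follows the standard Grad-CAM recipe on a torchvision ResNet50 and is illustrative, not the authors&#x2019; exact visualization code.</p>
<preformat>
import torch
import torchvision

model = torchvision.models.resnet50(num_classes=2).eval()
feats, grads = {}, {}
layer = model.layer4                                        # last convolutional block
layer.register_forward_hook(lambda mod, inp, out: feats.update(a=out))
layer.register_full_backward_hook(lambda mod, gin, gout: grads.update(a=gout[0]))

def grad_cam(x, class_idx):
    score = model(x)[0, class_idx]
    model.zero_grad()
    score.backward()
    weights = grads["a"].mean(dim=(2, 3), keepdim=True)     # pooled gradients per channel
    cam = torch.relu((weights * feats["a"]).sum(dim=1))     # weighted sum of feature maps
    return (cam / (cam.max() + 1e-8))[0]                    # normalized heat map

heat = grad_cam(torch.randn(1, 3, 224, 224), class_idx=1)   # random input for illustration
</preformat>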
<fig position="float" id="fig14">
<label>Figure 14</label>
<caption>
<p>Grad-CAM visualizations of the features learned by different methods. The top row shows the original image set, followed by non-pre-training, pre-training with ImageNet, and CoTP.</p>
</caption>
<graphic xlink:href="fmed-11-1360143-g014.tif"/>
</fig>
</sec>
<sec id="sec30">
<label>5.7</label>
<title>Comparison of inference efficiency</title>
<p>To assess the inference efficiency, we calculated the pre-training time and parameter counts of MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>), SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>), MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>), and our CoTP on the Omicron dataset. As shown in <xref ref-type="table" rid="tab11">Table 10</xref>, we made the following observations: (1) The parameter counts of the methods are nearly identical: MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>) adds a simple linear projection on top of ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>), while our CoTP adds an efficient token projection. (2) The training time of SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>) is the shortest among the methods because it does not utilize a memory bank. (3) Although our CoTP slightly exceeds the other methods in training time and parameters, it achieves the highest accuracy of 92.35% and significantly outperforms them.</p>
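<p>Parameter counts of the kind listed in <xref ref-type="table" rid="tab11">Table 10</xref> can be obtained directly from the model, for example as in the following sketch; the 128-dimensional head is an illustrative stand-in for the projection used during pre-training.</p>
<preformat>
import torchvision

model = torchvision.models.resnet50(num_classes=128)   # trunk with a 128-d output head
n_params = sum(p.numel() for p in model.parameters()) / 1e6
print(f"{n_params:.2f} M parameters")                  # roughly 24 M for this configuration
</preformat>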
<table-wrap position="float" id="tab11">
<label>Table 10</label>
<caption>
<p>Comparison of model efficiency on the Omicron dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Method</th>
<th align="left" valign="top">Architecture</th>
<th align="center" valign="top">Training time (h)</th>
<th align="center" valign="top">Para (M)</th>
<th align="center" valign="top">ACC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">MoCo-v1 (<xref ref-type="bibr" rid="ref18">18</xref>)</td>
<td align="left" valign="middle">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>)</td>
<td align="center" valign="middle">7.8</td>
<td align="center" valign="middle">24.03</td>
<td align="center" valign="middle">84.98</td>
</tr>
<tr>
<td align="left" valign="middle">SimCLRv1 (<xref ref-type="bibr" rid="ref15">15</xref>)</td>
<td align="left" valign="middle">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>)</td>
<td align="center" valign="middle">6.4</td>
<td align="center" valign="middle">24.03</td>
<td align="center" valign="middle">84.28</td>
</tr>
<tr>
<td align="left" valign="middle">MoCo-v2 (<xref ref-type="bibr" rid="ref19">19</xref>)</td>
<td align="left" valign="middle">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>) + Linear projection</td>
<td align="center" valign="middle">7.9</td>
<td align="center" valign="middle">24.10</td>
<td align="center" valign="middle">89.79</td>
</tr>
<tr>
<td align="left" valign="middle">CoTP (Ours)</td>
<td align="left" valign="middle">ResNet50 (<xref ref-type="bibr" rid="ref23">23</xref>) + Token projection</td>
<td align="center" valign="middle">8.1</td>
<td align="center" valign="middle">24.23</td>
<td align="center" valign="middle">
<bold>92.35</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The highest scores in each model are shown in boldface.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="conclusions" id="sec31">
<label>6</label>
<title>Conclusion</title>
<p>The existing contrastive mechanisms leave room for improvement in Omicron pneumonia diagnosis from chest CT images due to their inability to mine global features and the lack of appropriate augmentations for chest CT images. Therefore, we proposed a novel contrastive learning model with token projection, namely CoTP, for improving few-shot Omicron chest CT image diagnostic quality. Specifically, we designed the token projection to extract the global visual representation from unlabeled CT images. Furthermore, we leveraged random Poisson noise perturbation to simulate the noise in CT images as a novel data augmentation. In addition, the MAP head, which attends to the different spatial regions occupied by objects of different categories, was employed to improve classification performance during subsequent fine-tuning. Extensive experiments on the collected datasets demonstrated that our CoTP can provide high-quality representations and transferable initializations for CT image interpretation. In the future, we plan to design more effective pretext tasks and apply the proposed method to more medical image analysis tasks. For image segmentation and edge detection, we can employ the pre-trained encoder as a feature extractor and then add a segmentation head or a detection head.</p>
</sec>
<sec sec-type="data-availability" id="sec32">
<title>Data availability statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: The SARS-CoV-2 CT-scan dataset (<xref ref-type="bibr" rid="ref21">21</xref>) is available online. Omicron data that support the findings of this study are available from the corresponding author, YZ, upon reasonable request. Requests to access these datasets should be directed to <email>zhuyu@ecust.edu.cn</email>.</p>
</sec>
<sec sec-type="author-contributions" id="sec33">
<title>Author contributions</title>
<p>XJ: Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Software. DY: Writing &#x2013; review &#x0026; editing, Supervision. LF: Writing &#x2013; review &#x0026; editing, Data curation. YZ: Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Visualization, Validation, Methodology. MW: Writing &#x2013; review &#x0026; editing, Formal analysis, Data curation. YF: Writing &#x2013; review &#x0026; editing, Resources, Data curation. CB: Writing &#x2013; review &#x0026; editing, Supervision, Resources. HF: Writing &#x2013; review &#x0026; editing, Visualization, Supervision.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec34">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. The authors greatly appreciate the financial support from the National Natural Science Foundation of China (81971863, 82170110), the Shanghai Natural Science Foundation (22ZR1444700), the Shanghai Shenkang project for transformation for scientific production (SHDC2022CRD049), the Fujian Province Department of Science and Technology (2022D014), the Shanghai Pujiang Program (20PJ1402400), the Science and Technology Commission of Shanghai Municipality (20DZ2254400, 20DZ2261200), the Shanghai Municipal Science and Technology Major Project (ZD2021CY001), and the Shanghai Municipal Key Clinical Specialty (shslczdzk02201).</p>
</sec>
<sec sec-type="COI-statement" id="sec35">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The author(s) declared that they were an editorial board member of Frontiers, at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec id="sec100" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z</given-names></name> <name><surname>Deng</surname> <given-names>X</given-names></name> <name><surname>Fang</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>K</given-names></name> <name><surname>Wu</surname> <given-names>Y</given-names></name> <name><surname>Che</surname> <given-names>T</given-names></name> <etal/></person-group>. <article-title>Epidemiological characteristics and transmission dynamics of the outbreak caused by the SARS-CoV-2 omicron variant in Shanghai, China: a descriptive study</article-title>. <source>Lancet Reg Health-Western Pac</source>. (<year>2022</year>) <volume>29</volume>:<fpage>100592</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.lanwpc.2022.100592</pub-id>, PMID: <pub-id pub-id-type="pmid">36090701</pub-id></citation></ref>
<ref id="ref2"><label>2.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Lai</surname> <given-names>S</given-names></name> <name><surname>Gao</surname> <given-names>GF</given-names></name> <name><surname>Shi</surname> <given-names>W</given-names></name></person-group>. <article-title>The emergence, genomic diversity and global spread of SARS-CoV-2</article-title>. <source>Nature</source>. (<year>2021</year>) <volume>600</volume>:<fpage>408</fpage>&#x2013;<lpage>18</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-021-04188-6</pub-id></citation></ref>
<ref id="ref3"><label>3.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tian</surname> <given-names>D</given-names></name> <name><surname>Sun</surname> <given-names>Y</given-names></name> <name><surname>Xu</surname> <given-names>H</given-names></name> <name><surname>Ye</surname> <given-names>Q</given-names></name></person-group>. <article-title>The emergence and epidemic characteristics of the highly mutated SARS-CoV-2 omicron variant</article-title>. <source>J Med Virol</source>. (<year>2022</year>) <volume>94</volume>:<fpage>2376</fpage>&#x2013;<lpage>83</lpage>. doi: <pub-id pub-id-type="doi">10.1002/jmv.27643</pub-id>, PMID: <pub-id pub-id-type="pmid">35118687</pub-id></citation></ref>
<ref id="ref4"><label>4.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>X</given-names></name> <name><surname>Yan</surname> <given-names>X</given-names></name> <name><surname>Sun</surname> <given-names>K</given-names></name> <name><surname>Zheng</surname> <given-names>N</given-names></name> <name><surname>Sun</surname> <given-names>R</given-names></name> <name><surname>Zhou</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Estimation of disease burden and clinical severity of COVID-19 caused by omicron BA. 2 in Shanghai, February-June 2022</article-title>. <source>Emerg Microb Infect</source>. (<year>2022</year>) <volume>11</volume>:<fpage>2800</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1101/2022.07.11.22277504</pub-id>, PMID: <pub-id pub-id-type="pmid">36205530</pub-id></citation></ref>
<ref id="ref5"><label>5.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wilder-Smith</surname> <given-names>A</given-names></name> <name><surname>Freedman</surname> <given-names>DO</given-names></name></person-group>. <article-title>Isolation, quarantine, social distancing and community containment: pivotal role for old-style public health measures in the novel coronavirus (2019-nCoV) outbreak</article-title>. <source>J Travel Med</source>. (<year>2020</year>) <volume>27</volume>:<fpage>1</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1093/jtm/taaa020</pub-id>, PMID: <pub-id pub-id-type="pmid">32052841</pub-id></citation></ref>
<ref id="ref6"><label>6.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Van Elden</surname> <given-names>LJ</given-names></name> <name><surname>Anton</surname> <given-names>MAM</given-names></name> <name><surname>Van Alphen</surname> <given-names>F</given-names></name> <name><surname>Hendriksen</surname> <given-names>KA</given-names></name> <name><surname>Hoepelman</surname> <given-names>AI</given-names></name> <name><surname>Van Kraaij</surname> <given-names>MG</given-names></name> <etal/></person-group>. <article-title>Frequent detection of human coronaviruses in clinical specimens from patients with respiratory tract infection by use of a novel real-time reverse-transcriptase polymerase chain reaction</article-title>. <source>J Infect Dis</source>. (<year>2004</year>) <volume>189</volume>:<fpage>652</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1086/381207</pub-id>, PMID: <pub-id pub-id-type="pmid">14767819</pub-id></citation></ref>
<ref id="ref7"><label>7.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ai</surname> <given-names>T</given-names></name> <name><surname>Yang</surname> <given-names>Z</given-names></name> <name><surname>Hou</surname> <given-names>H</given-names></name> <name><surname>Zhan</surname> <given-names>C</given-names></name> <name><surname>Chen</surname> <given-names>C</given-names></name> <name><surname>Lv</surname> <given-names>W</given-names></name> <etal/></person-group>. <article-title>Correlation of chest CT and RT-PCR testing in coronavirus disease 2019 (COVID-19) in China: a report of 1014 cases</article-title>. <source>Radiology</source>. (<year>2020</year>) <volume>296</volume>:<fpage>E32</fpage>&#x2013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1148/radiol.2020200642</pub-id>, PMID: <pub-id pub-id-type="pmid">32101510</pub-id></citation></ref>
<ref id="ref8"><label>8.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Q</given-names></name> <name><surname>Liu</surname> <given-names>Q</given-names></name> <name><surname>Xu</surname> <given-names>H</given-names></name> <name><surname>Lu</surname> <given-names>H</given-names></name> <name><surname>Liu</surname> <given-names>S</given-names></name> <name><surname>Li</surname> <given-names>H</given-names></name></person-group>. <article-title>Imaging of coronavirus disease 2019: a Chinese expert consensus statement</article-title>. <source>Eur J Radiol</source>. (<year>2020</year>) <volume>127</volume>:<fpage>109008</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ejrad.2020.109008</pub-id>, PMID: <pub-id pub-id-type="pmid">32335426</pub-id></citation></ref>
<ref id="ref9"><label>9.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>X</given-names></name> <name><surname>Hui</surname> <given-names>H</given-names></name> <name><surname>Niu</surname> <given-names>M</given-names></name> <name><surname>Li</surname> <given-names>L</given-names></name> <name><surname>Wang</surname> <given-names>L</given-names></name> <name><surname>He</surname> <given-names>B</given-names></name> <etal/></person-group>. <article-title>Deep learning-based multi-view fusion model for screening 2019 novel coronavirus pneumonia: a multicentre study</article-title>. <source>Eur J Radiol</source>. (<year>2020</year>) <volume>128</volume>:<fpage>109041</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ejrad.2020.109041</pub-id>, PMID: <pub-id pub-id-type="pmid">32408222</pub-id></citation></ref>
<ref id="ref10"><label>10.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y</given-names></name> <name><surname>Bengio</surname> <given-names>Y</given-names></name> <name><surname>Hinton</surname> <given-names>G</given-names></name></person-group>. <article-title>Deep learning</article-title>. <source>Nature</source>. (<year>2015</year>) <volume>521</volume>:<fpage>436</fpage>&#x2013;<lpage>44</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature14539</pub-id></citation></ref>
<ref id="ref11"><label>11.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kavakiotis</surname> <given-names>I</given-names></name> <name><surname>Tsave</surname> <given-names>O</given-names></name> <name><surname>Salifoglou</surname> <given-names>A</given-names></name> <name><surname>Maglaveras</surname> <given-names>N</given-names></name> <name><surname>Vlahavas</surname> <given-names>I</given-names></name> <name><surname>Chouvarda</surname> <given-names>I</given-names></name></person-group>. <article-title>Machine learning and data mining methods in diabetes research</article-title>. <source>Comput Struct Biotechnol J</source>. (<year>2017</year>) <volume>15</volume>:<fpage>104</fpage>&#x2013;<lpage>16</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.csbj.2016.12.005</pub-id>, PMID: <pub-id pub-id-type="pmid">28138367</pub-id></citation></ref>
<ref id="ref12"><label>12.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Willemink</surname> <given-names>MJ</given-names></name> <name><surname>Koszek</surname> <given-names>WA</given-names></name> <name><surname>Hardell</surname> <given-names>C</given-names></name> <name><surname>Wu</surname> <given-names>J</given-names></name> <name><surname>Fleischmann</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Preparing medical imaging data for machine learning</article-title>. <source>Radiology</source>. (<year>2020</year>) <volume>295</volume>:<fpage>4</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.1148/radiol.2020192224</pub-id>, PMID: <pub-id pub-id-type="pmid">32068507</pub-id></citation></ref>
<ref id="ref13"><label>13.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Sowrirajan</surname> <given-names>H</given-names></name> <name><surname>Yang</surname> <given-names>J</given-names></name> <name><surname>Ng</surname> <given-names>AY</given-names></name> <name><surname>Rajpurkar</surname> <given-names>P</given-names></name></person-group> (<year>2021</year>) <article-title>Moco pretraining improves representation and transferability of chest x-ray models</article-title>. <conf-name>International Conference on Medical Imaging with deep learning (MIDL)</conf-name>.</citation></ref>
<ref id="ref14"><label>14.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>D</given-names></name> <name><surname>Wu</surname> <given-names>G</given-names></name> <name><surname>Suk</surname> <given-names>H-I</given-names></name></person-group>. <article-title>Deep learning in medical image analysis</article-title>. <source>Annu Rev Biomed Eng</source>. (<year>2017</year>) <volume>19</volume>:<fpage>221</fpage>&#x2013;<lpage>48</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev-bioeng-071516-044442</pub-id>, PMID: <pub-id pub-id-type="pmid">28301734</pub-id></citation></ref>
<ref id="ref15"><label>15.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T</given-names></name> <name><surname>Kornblith</surname> <given-names>S</given-names></name> <name><surname>Norouzi</surname> <given-names>M</given-names></name> <name><surname>Hinton</surname> <given-names>G</given-names></name></person-group> (<year>2020</year>) <article-title>A simple framework for contrastive learning of visual representations</article-title><conf-name>International conference on machine learning (ICML)</conf-name>.</citation></ref>
<ref id="ref16"><label>16.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Z</given-names></name> <name><surname>Xiong</surname> <given-names>Y</given-names></name> <name><surname>Yu</surname> <given-names>SX</given-names></name> <name><surname>Lin</surname> <given-names>D</given-names></name></person-group> (<year>2018</year>) <article-title>Unsupervised feature learning via non-parametric instance discrimination</article-title>. <conf-name>IEEE/CVF conference on computer vision and pattern recognition (CVPR)</conf-name>.</citation></ref>
<ref id="ref17"><label>17.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T</given-names></name> <name><surname>Kornblith</surname> <given-names>S</given-names></name> <name><surname>Swersky</surname> <given-names>K</given-names></name> <name><surname>Norouzi</surname> <given-names>M</given-names></name> <name><surname>Hinton</surname> <given-names>GE</given-names></name></person-group> (<year>2020</year>) <article-title>Big self-supervised models are strong semi-supervised learners</article-title>. <conf-name>Annual conference on neural information processing systems (Neur IPS)</conf-name>.</citation></ref>
<ref id="ref18"><label>18.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K</given-names></name> <name><surname>Fan</surname> <given-names>H</given-names></name> <name><surname>Wu</surname> <given-names>Y</given-names></name> <name><surname>Xie</surname> <given-names>S</given-names></name> <name><surname>Girshick</surname> <given-names>R</given-names></name></person-group> (<year>2020</year>) <article-title>Momentum contrast for unsupervised visual representation learning</article-title>. <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>.</citation></ref>
<ref id="ref19"><label>19.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>X</given-names></name> <name><surname>Fan</surname> <given-names>H</given-names></name> <name><surname>Girshick</surname> <given-names>R</given-names></name> <name><surname>He</surname> <given-names>K</given-names></name></person-group>. <article-title>Improved baselines with momentum contrastive learning</article-title>. <source>ar Xiv</source>. (<year>2020</year>). doi: <pub-id pub-id-type="doi">10.48550/arXiv.2003.04297</pub-id></citation></ref>
<ref id="ref20"><label>20.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A</given-names></name> <name><surname>Shazeer</surname> <given-names>N</given-names></name> <name><surname>Parmar</surname> <given-names>N</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J</given-names></name> <name><surname>Jones</surname> <given-names>L</given-names></name> <name><surname>Gomez</surname> <given-names>AN</given-names></name> <etal/></person-group>. <article-title>Attention is all you need</article-title>. <source>Neural Inform Process Syst</source>. (<year>2017</year>) <volume>30</volume>:<fpage>5998</fpage>&#x2013;<lpage>08</lpage>.</citation></ref>
<ref id="ref21"><label>21.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Soares</surname> <given-names>E</given-names></name> <name><surname>Angelov</surname> <given-names>P</given-names></name> <name><surname>Biaso</surname> <given-names>S</given-names></name> <name><surname>Froes</surname> <given-names>MH</given-names></name> <name><surname>Abe</surname> <given-names>DK</given-names></name></person-group>. <article-title>SARS-CoV-2 CT-scan dataset: a large dataset of real patients CT scans for SARS-CoV-2 identification</article-title>. <source>Med Rxiv</source>. (<year>2020</year>) <volume>10</volume>:<fpage>1</fpage>&#x2013;<lpage>8</lpage>.</citation></ref>
<ref id="ref22"><label>22.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mei</surname> <given-names>X</given-names></name> <name><surname>Lee</surname> <given-names>H-C</given-names></name> <name><surname>Diao</surname> <given-names>K-y</given-names></name> <name><surname>Huang</surname> <given-names>M</given-names></name> <name><surname>Lin</surname> <given-names>B</given-names></name> <name><surname>Liu</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>Artificial intelligence&#x2013;enabled rapid diagnosis of patients with COVID-19</article-title>. <source>Nat Med</source>. (<year>2020</year>) <volume>26</volume>:<fpage>1224</fpage>&#x2013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-020-0931-3</pub-id>, PMID: <pub-id pub-id-type="pmid">32427924</pub-id></citation></ref>
<ref id="ref23"><label>23.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Ren</surname> <given-names>S</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name></person-group> (<year>2016</year>) <article-title>Deep residual learning for image recognition</article-title>. <conf-name>IEEE/CVF Conference Computing Vision Pattern Recognisition</conf-name>.</citation></ref>
<ref id="ref24"><label>24.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J</given-names></name> <name><surname>Wu</surname> <given-names>L</given-names></name> <name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>Gong</surname> <given-names>D</given-names></name> <name><surname>Zhao</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Deep learning-based model for detecting 2019 novel coronavirus pneumonia on high-resolution computed tomography</article-title>. <source>Sci Rep</source>. (<year>2020</year>) <volume>10</volume>:<fpage>19196</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-020-76282-0</pub-id>, PMID: <pub-id pub-id-type="pmid">33154542</pub-id></citation></ref>
<ref id="ref25"><label>25.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Qiu</surname> <given-names>Y</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Li</surname> <given-names>S</given-names></name> <name><surname>Xu</surname> <given-names>J</given-names></name></person-group>. <article-title>Miniseg: an extremely minimum network for efficient covid-19 segmentation</article-title>. In: <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence. Virtually: AAAI</conf-name> (<year>2021</year>).</citation></ref>
<ref id="ref26"><label>26.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>G</given-names></name> <name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>C</given-names></name> <name><surname>Xu</surname> <given-names>Z</given-names></name> <name><surname>Ruan</surname> <given-names>J</given-names></name> <name><surname>Zhu</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>A noise-robust framework for automatic segmentation of COVID-19 pneumonia lesions from CT images</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2020</year>) <volume>39</volume>:<fpage>2653</fpage>&#x2013;<lpage>63</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2020.3000314</pub-id>, PMID: <pub-id pub-id-type="pmid">32730215</pub-id></citation></ref>
<ref id="ref27"><label>27.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Panwar</surname> <given-names>H</given-names></name> <name><surname>Gupta</surname> <given-names>P</given-names></name> <name><surname>Siddiqui</surname> <given-names>MK</given-names></name> <name><surname>Morales-Menendez</surname> <given-names>R</given-names></name> <name><surname>Bhardwaj</surname> <given-names>P</given-names></name> <name><surname>Singh</surname> <given-names>VJC</given-names></name> <etal/></person-group>. <article-title>A deep learning and grad-CAM based color visualization approach for fast detection of COVID-19 cases using chest X-ray and CT-scan images</article-title>. <source>Chaos Solitons Fractals</source>. (<year>2020</year>) <volume>140</volume>:<fpage>110190</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.chaos.2020.110190</pub-id>, PMID: <pub-id pub-id-type="pmid">32836918</pub-id></citation></ref>
<ref id="ref28"><label>28.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>X</given-names></name> <name><surname>Zheng</surname> <given-names>B</given-names></name> <name><surname>Zhu</surname> <given-names>Y</given-names></name> <name><surname>Yu</surname> <given-names>F</given-names></name> <name><surname>Zhang</surname> <given-names>R</given-names></name> <name><surname>Chen</surname> <given-names>B</given-names></name></person-group>. <article-title>COVID-19 lesion discrimination and localization network based on multi-receptive field attention module on CT images</article-title>. <source>Optik</source>. (<year>2021</year>) <volume>241</volume>:<fpage>167100</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijleo.2021.167100</pub-id>, PMID: <pub-id pub-id-type="pmid">33976457</pub-id></citation></ref>
<ref id="ref29"><label>29.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>P</given-names></name> <name><surname>Yin</surname> <given-names>X</given-names></name> <name><surname>Lu</surname> <given-names>H</given-names></name> <name><surname>Hu</surname> <given-names>Z</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Jiang</surname> <given-names>R</given-names></name> <etal/></person-group>. <article-title>CS-CO: a hybrid self-supervised visual representation learning method for H &#x0026; E-stained histopathological images</article-title>. <source>Med Image Anal</source>. (<year>2022</year>) <volume>81</volume>:<fpage>102539</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2022.102539</pub-id>, PMID: <pub-id pub-id-type="pmid">35926337</pub-id></citation></ref>
<ref id="ref30"><label>30.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Jiang</surname> <given-names>H</given-names></name> <name><surname>Miura</surname> <given-names>Y</given-names></name> <name><surname>Manning</surname> <given-names>CD</given-names></name> <name><surname>Langlotz</surname> <given-names>CP</given-names></name></person-group>. <article-title>Contrastive learning of medical visual representations from paired images and text</article-title>. <source>ar Xiv</source>. (<year>2020</year>) <volume>2010</volume>:<fpage>2</fpage>&#x2013;<lpage>25</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2010.00747</pub-id></citation></ref>
<ref id="ref31"><label>31.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chaitanya</surname> <given-names>K</given-names></name> <name><surname>Erdil</surname> <given-names>E</given-names></name> <name><surname>Karani</surname> <given-names>N</given-names></name> <name><surname>Konukoglu</surname> <given-names>E</given-names></name></person-group>. <article-title>Contrastive learning of global and local features for medical image segmentation with limited annotations</article-title>. <source>Adv Neural Inf Proces Syst</source>. (<year>2020</year>) <volume>33</volume>:<fpage>12546</fpage>&#x2013;<lpage>58</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2006.10511</pub-id></citation></ref>
<ref id="ref32"><label>32.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zeng</surname> <given-names>D</given-names></name> <name><surname>Wu</surname> <given-names>Y</given-names></name> <name><surname>Hu</surname> <given-names>X</given-names></name> <name><surname>Xu</surname> <given-names>X</given-names></name> <name><surname>Yuan</surname> <given-names>H</given-names></name> <name><surname>Huang</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Positional contrastive learning for volumetric medical image segmentation</article-title>. <conf-name>International Conference on Medical Image Computing and Computer-assisted Intervention</conf-name>. <publisher-loc>Strasbourg</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2021</year>).</citation></ref>
<ref id="ref33"><label>33.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y</given-names></name> <name><surname>Zeng</surname> <given-names>D</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name> <name><surname>Shi</surname> <given-names>Y</given-names></name> <name><surname>Hu</surname> <given-names>J</given-names></name></person-group>. <article-title>Distributed contrastive learning for medical image segmentation</article-title>. <source>Med Image Anal</source>. (<year>2022</year>) <volume>81</volume>:<fpage>102564</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2022.102564</pub-id></citation></ref>
<ref id="ref34"><label>34.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khalifa</surname> <given-names>NE</given-names></name> <name><surname>Loey</surname> <given-names>M</given-names></name> <name><surname>Mirjalili</surname> <given-names>S</given-names></name></person-group>. <article-title>A comprehensive survey of recent trends in deep learning for digital images augmentation</article-title>. <source>Artif Intell Rev</source>. (<year>2022</year>) <volume>55</volume>:<fpage>2351</fpage>&#x2013;<lpage>77</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10462-021-10066-4</pub-id>, PMID: <pub-id pub-id-type="pmid">34511694</pub-id></citation></ref>
<ref id="ref35"><label>35.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Boyat</surname> <given-names>AK</given-names></name> <name><surname>Joshi</surname> <given-names>BK</given-names></name></person-group>. <article-title>A review paper: noise models in digital image processing</article-title>. <source>Sig Image Process</source>. (<year>2015</year>) <volume>6</volume>:<fpage>63</fpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1505.03489</pub-id></citation></ref>
<ref id="ref36"><label>36.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Evangelista</surname> <given-names>RC</given-names></name> <name><surname>Salvadeo</surname> <given-names>DH</given-names></name> <name><surname>Mascarenhas</surname> <given-names>ND</given-names></name></person-group>. <article-title>A new bayesian Poisson denoising algorithm based on nonlocal means and stochastic distances</article-title>. <source>Pattern Recog Lett</source>. (<year>2022</year>) <volume>122</volume>:<fpage>108363</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patcog.2021.108363</pub-id></citation></ref>
<ref id="ref37"><label>37.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhuang</surname> <given-names>T</given-names></name> <name><surname>Leng</surname> <given-names>S</given-names></name> <name><surname>Nett</surname> <given-names>BE</given-names></name> <name><surname>Chen</surname> <given-names>G-H</given-names></name></person-group>. <article-title>Fan-beam and cone-beam image reconstruction via filtering the backprojection image of differentiated projection data</article-title>. <source>Phys Med Biol</source>. (<year>2004</year>) <volume>49</volume>:<fpage>5489</fpage>&#x2013;<lpage>503</lpage>. doi: <pub-id pub-id-type="doi">10.1088/0031-9155/49/24/007</pub-id>, PMID: <pub-id pub-id-type="pmid">15724538</pub-id></citation></ref>
<ref id="ref38"><label>38.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stierstorfer</surname> <given-names>K</given-names></name> <name><surname>Rauscher</surname> <given-names>A</given-names></name> <name><surname>Boese</surname> <given-names>J</given-names></name> <name><surname>Bruder</surname> <given-names>H</given-names></name> <name><surname>Schaller</surname> <given-names>S</given-names></name> <name><surname>Flohr</surname> <given-names>T</given-names></name></person-group>. <article-title>Weighted FBP&#x2014;a simple approximate 3D FBP algorithm for multislice spiral CT with good dose usage for arbitrary pitch</article-title>. <source>Phys Med Biol</source>. (<year>2004</year>) <volume>49</volume>:<fpage>2209</fpage>&#x2013;<lpage>18</lpage>. doi: <pub-id pub-id-type="doi">10.1088/0031-9155/49/11/007</pub-id>, PMID: <pub-id pub-id-type="pmid">15248573</pub-id></citation></ref>
<ref id="ref39"><label>39.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>J</given-names></name> <name><surname>Ling</surname> <given-names>CX</given-names></name></person-group>. <article-title>Using AUC and accuracy in evaluating learning algorithms</article-title>. <source>IEEE Trans Knowl Data Eng</source>. (<year>2005</year>) <volume>17</volume>:<fpage>299</fpage>&#x2013;<lpage>10</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TKDE.2005.50</pub-id></citation></ref>
<ref id="ref40"><label>40.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Barber</surname> <given-names>JA</given-names></name> <name><surname>Thompson</surname> <given-names>SG</given-names></name></person-group>. <article-title>Analysis of cost data in randomized trials: an application of the non-parametric bootstrap</article-title>. <source>Stat Med</source>. (<year>2000</year>) <volume>19</volume>:<fpage>3219</fpage>&#x2013;<lpage>36</lpage>. PMID: <pub-id pub-id-type="pmid">11113956</pub-id></citation></ref>
<ref id="ref41"><label>41.</label> <citation citation-type="other"><person-group person-group-type="author"><name><surname>Simonyan</surname> <given-names>K</given-names></name> <name><surname>Zisserman</surname> <given-names>A</given-names></name></person-group>. <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. Ar xiv preprint ar xiv: 409.1556. (<year>2014</year>).</citation></ref>
<ref id="ref42"><label>42.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>G</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Van Der Maaten</surname> <given-names>L</given-names></name> <name><surname>Weinberger</surname> <given-names>KQ</given-names></name></person-group> (<year>2017</year>). <article-title>Densely connected convolutional networks</article-title>. <conf-name>IEEE/CVF conference on computer vision and pattern recognition (CVPR)</conf-name>.</citation></ref>
<ref id="ref43"><label>43.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>J</given-names></name> <name><surname>Dong</surname> <given-names>W</given-names></name> <name><surname>Socher</surname> <given-names>R</given-names></name> <name><surname>Li</surname> <given-names>L-J</given-names></name> <name><surname>Li</surname> <given-names>K</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L</given-names></name></person-group>. <article-title>Imagenet: a large-scale hierarchical image database</article-title>. In <conf-name>2009 IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <publisher-loc>Miami</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2009</year>).</citation></ref>
<ref id="ref44"><label>44.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gaur</surname> <given-names>P</given-names></name> <name><surname>Malaviya</surname> <given-names>V</given-names></name> <name><surname>Gupta</surname> <given-names>A</given-names></name> <name><surname>Bhatia</surname> <given-names>G</given-names></name> <name><surname>Pachori</surname> <given-names>RB</given-names></name> <name><surname>Sharma</surname> <given-names>D</given-names></name></person-group>. <article-title>COVID-19 disease identification from chest CT images using empirical wavelet transformation and transfer learning</article-title>. <source>Biomed Sig Process Control</source>. (<year>2022</year>) <volume>71</volume>:<fpage>103076</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2021.103076</pub-id>, PMID: <pub-id pub-id-type="pmid">34457034</pub-id></citation></ref>
<ref id="ref45"><label>45.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Ewen</surname> <given-names>N</given-names></name> <name><surname>Khan</surname> <given-names>N</given-names></name></person-group> (<year>2021</year>) <article-title>Targeted self supervision for classification on a small covid-19 ct scan dataset</article-title>. <conf-name>International symposium on biomedical imaging (ISBI)</conf-name>.</citation></ref>
<ref id="ref46"><label>46.</label> <citation citation-type="other"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>X</given-names></name> <name><surname>He</surname> <given-names>X</given-names></name> <name><surname>Zhao</surname> <given-names>J</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>S</given-names></name> <name><surname>Xie</surname> <given-names>P</given-names></name></person-group>. <article-title>COVID-CT-dataset: a CT scan dataset about COVID-19</article-title>. Ar xiv preprint ar xiv: 2003.13865. (<year>2020</year>).</citation></ref>
<ref id="ref47"><label>47.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ahmed</surname> <given-names>SAA</given-names></name> <name><surname>Yavuz</surname> <given-names>MC</given-names></name> <name><surname>&#x015E;en</surname> <given-names>MU</given-names></name> <name><surname>G&#x00FC;l&#x015F;en</surname> <given-names>F</given-names></name> <name><surname>Tutar</surname> <given-names>O</given-names></name> <name><surname>Korkmazer</surname> <given-names>B</given-names></name> <etal/></person-group>. <article-title>Comparison and ensemble of 2D and 3D approaches for COVID-19 detection in CT images</article-title>. <source>Neurocomputing</source>. (<year>2022</year>) <volume>488</volume>:<fpage>457</fpage>&#x2013;<lpage>69</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2022.02.018</pub-id>, PMID: <pub-id pub-id-type="pmid">35345875</pub-id></citation></ref>
<ref id="ref48"><label>48.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chaudhary</surname> <given-names>PK</given-names></name> <name><surname>Pachori</surname> <given-names>RB</given-names></name></person-group>. <article-title>FBSED based automatic diagnosis of COVID-19 using X-ray and CT images</article-title>. <source>Comp Biol Med Glob Surv</source>. (<year>2021</year>) <volume>134</volume>:<fpage>104454</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.104454</pub-id>, PMID: <pub-id pub-id-type="pmid">33965836</pub-id></citation></ref>
<ref id="ref49"><label>49.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z</given-names></name> <name><surname>Liu</surname> <given-names>Q</given-names></name> <name><surname>Dou</surname> <given-names>Q</given-names></name></person-group>. <article-title>Contrastive cross-site learning with redesigned net for COVID-19 CT classification</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2020</year>) <volume>24</volume>:<fpage>2806</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2009.07652</pub-id>, PMID: <pub-id pub-id-type="pmid">32915751</pub-id></citation></ref>
<ref id="ref50"><label>50.</label> <citation citation-type="journal"><person-group person-group-type="author"><name><surname>Patel</surname> <given-names>RK</given-names></name> <name><surname>Kashyap</surname> <given-names>M</given-names></name></person-group>. <article-title>Automated diagnosis of COVID stages from lung CT images using statistical features in 2-dimensional flexible analytic wavelet transform</article-title>. <source>Biocybernet Biomed Eng</source>. (<year>2022</year>) <volume>42</volume>:<fpage>829</fpage>&#x2013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bbe.2022.06.005</pub-id>, PMID: <pub-id pub-id-type="pmid">35791429</pub-id></citation></ref>
<ref id="ref51"><label>51.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Lin</surname> <given-names>Y</given-names></name> <name><surname>Cao</surname> <given-names>Y</given-names></name> <name><surname>Hu</surname> <given-names>H</given-names></name> <name><surname>Wei</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>Z</given-names></name> <etal/></person-group>. (<year>2021</year>) <article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>. <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>.</citation></ref>
<ref id="ref52"><label>52.</label> <citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Selvaraju</surname> <given-names>RR</given-names></name> <name><surname>Cogswell</surname> <given-names>M</given-names></name> <name><surname>Das</surname> <given-names>A</given-names></name> <name><surname>Vedantam</surname> <given-names>R</given-names></name> <name><surname>Parikh</surname> <given-names>D</given-names></name> <name><surname>Batra</surname> <given-names>D</given-names></name></person-group> <article-title>Grad-cam: visual explanations from deep networks via gradient-based localization</article-title>. <conf-name>IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name> (<year>2017</year>).</citation></ref>
</ref-list>
</back>
</article>