<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">860510</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2022.860510</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>DeepLION: Deep Multi-Instance Learning Improves the Prediction of Cancer-Associated T Cell Receptors for Accurate Cancer Detection</article-title>
<alt-title alt-title-type="left-running-head">Xu et al.</alt-title>
<alt-title alt-title-type="right-running-head">DeepLION Improves CaTCR Prediction</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Xu</surname>
<given-names>Ying</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Qian</surname>
<given-names>Xinyang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1647830/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Xuanping</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lai</surname>
<given-names>Xin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Yuqian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Jiayin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/615156/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Computer Science and Technology</institution>, <institution>School of Electronic and Information Engineering</institution>, <institution>Xi&#x2019;an Jiaotong University</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Institute of Data Science and Information Quality</institution>, <institution>Shaanxi Engineering Research Center of Medical and Health Big Data</institution>, <institution>Xi&#x2019;an Jiaotong University</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/589586/overview">Leyi Wei</ext-link>, Shandong University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1663314/overview">Tian Xia</ext-link>, Huazhong University of Science and Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1666468/overview">Chong Chu</ext-link>, Harvard Medical School, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jiayin Wang, <email>wangjiayin@mail.xjtu.edu.cn</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this work and share first authorship</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Human and Medical Genomics, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>11</day>
<month>04</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>860510</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>01</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>02</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Xu, Qian, Zhang, Lai, Liu and Wang.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Xu, Qian, Zhang, Lai, Liu and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Recent studies highlight the potential of T cell receptor (TCR) repertoires in accurately detecting cancers <italic>via</italic> noninvasive sampling. Unfortunately, due to the complicated associations among cancer antigens and the possible induced T cell responses, currently, the practical strategy for identifying cancer-associated TCRs is the computational prediction based on TCR repertoire data. Several state-of-the-art methods were proposed in the past year or two; however, the prediction algorithms were still weakened by two major issues. To facilitate the computational processes, the algorithms prefer to decompose the original TCR sequences into length-fixed amino acid fragments, while the first dilemma comes as the lengths of cancer-associated motifs are suggested to vary. Moreover, the correlations among TCRs in the same repertoire should be further considered, which are often ignored by the existing methods. We here developed a deep multi-instance learning method, named DeepLION, to improve the prediction of cancer-associated TCRs by considering these issues. First, DeepLION introduced a deep learning framework with alternative convolution filters and 1-max pooling operations to handle the amino acid fragments with different lengths. Then, the multi-instance learning framework modeled the TCR correlations and assigned adjusted weights for each TCR sequence during the predicting process. To validate the performance of DeepLION, we conducted a series of experiments on several cohorts of patients from nine cancer types. Compared to the existing methods, DeepLION achieved, on most of the cohorts, higher prediction accuracies, sensitivities, specificities, and areas under the curve (AUCs), where the AUC reached notably 0.97 and 0.90 for thyroid and lung cancer cohorts, respectively. Thus, DeepLION may further support the detection of cancers from TCR repertoire data. 
DeepLION is publicly available on GitHub, at <ext-link ext-link-type="uri" xlink:href="https://github.com/Bioinformatics7181/DeepLION">https://github.com/Bioinformatics7181/DeepLION</ext-link>, for academic usage only.</p>
</abstract>
<kwd-group>
<kwd>T cell receptor</kwd>
<kwd>TCR repertoire data analysis</kwd>
<kwd>cancer-associated TCR</kwd>
<kwd>machine learning approach</kwd>
<kwd>multi-instance learning</kwd>
<kwd>deep learning framework</kwd>
</kwd-group>
<contract-sponsor id="cn001">Natural Science Basic Research Program of Shaanxi Province<named-content content-type="fundref-id">10.13039/501100017596</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content>
</contract-sponsor>
<contract-sponsor id="cn003">National Key Research and Development Program of China<named-content content-type="fundref-id">10.13039/501100012166</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>T cells that respond to tumor antigens are the central mediators of cancer immunity (<xref ref-type="bibr" rid="B12">Gubin et al., 2014</xref>; <xref ref-type="bibr" rid="B27">Tran et al., 2014</xref>; <xref ref-type="bibr" rid="B28">Tumeh et al., 2014</xref>). For a cancer patient, the T cell repertoire often undergoes some cancer-specific changes during the tumor progression (<xref ref-type="bibr" rid="B24">Schreiber et al., 2011</xref>), where the T cell receptors (TCRs) involved are defined as cancer-associated TCRs (caTCRs). It is reported that some caTCRs may share universal biochemical signatures (<xref ref-type="bibr" rid="B3">Chowell et al., 2015</xref>; <xref ref-type="bibr" rid="B19">Li et al., 2016</xref>). Recent studies further indicated that there are shared antigens and TCRs among the patients with the same cancer type or subtype (<xref ref-type="bibr" rid="B16">Kvistborg et al., 2013</xref>; <xref ref-type="bibr" rid="B6">Dhodapkar and Dhodapkar, 2016</xref>). The rapid development of immune repertoire sequencing (IR-seq) (<xref ref-type="bibr" rid="B15">Kirsch et al., 2015</xref>) enables a comprehensive view of TCR repertoires on both individual and population levels. Then, it is natural that several computational frameworks were proposed to predict the caTCRs, some of which further attempted to distinguish the cancer-associated repertoires from those non-cancer ones.</p>
<p>However, accurately predicting the caTCRs is quite challenging work, mainly due to the tremendous heterogeneity on personal antigen landscapes, while the lack of knowledge about the cancer antigens inducing spontaneous T cell responses brings additional difficulty (<xref ref-type="bibr" rid="B5">Coulie et al., 2014</xref>). To complete this work, several studies attempted to mine the biochemical properties of caTCRs from TCR sequencing (TCR-seq) data. The majority of them focused on the TCR&#x3b2; chain complementarity determining region 3 (CDR3) because it primarily determines the antigen-binding specificity as the somatically generated portion of the gene. Given the computational difficulty of analyzing the entire CDR3 sequences, some approaches simplified computational processes by preprocessing the original sequences into length-fixed overlapping adjacent amino acid (AA) fragments and predicted the caTCRs by identifying key motifs in the fragments, but there is currently no consensus on the sequence decomposition strategy. Cinelli et al. decomposed the sequences into triplets (denoted as 3-mers) and then selected the key motifs from the 3-mers with 1-dimensional (1-D) Bayesian classifier to train their support vector machine model for repertoire classification (<xref ref-type="bibr" rid="B4">Cinelli et al., 2017</xref>). Sun et al. adopted Cinelli&#x2019;s sequence decomposition strategy and trained their LPBoost model with the frequencies of 3-mers to identify key motifs and classify repertoires (<xref ref-type="bibr" rid="B25">Sun et al., 2017</xref>). Interestingly, Ostmeyer&#x2019;s study partitioned the sequence into 4-mers (other than 3-mers) based on the analysis of the X-ray crystal structure of human TCR bound to peptide-major histocompatibility complex (MHC) before distinguishing tumor tissue from adjacent healthy tissue in colorectal and breast cancer samples (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>). 
Furthermore, the X-ray crystal structure analysis (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>) revealed that the size of adjacent CDR3 residues in direct contact with peptide varied from two to eight, implying that the length of the key biochemical motifs in TCRs should not be fixed. Therefore, decomposing sequences into <italic>z</italic>-mers, which limits these approaches to identifying length-fixed key motifs, is considered to lead to information loss and may harm subsequent model classification.</p>
<p>In contrast to the above approaches, some studies investigated the entire CDR3 sequences to consider the correlations among sequences. Emerson et al. built a statistical classification framework that could predict cytomegalovirus (CMV) status from the resulting catalog of CDR3 sequences (<xref ref-type="bibr" rid="B8">Emerson et al., 2017</xref>). Yokota&#x2019;s approach compared the TCR repertoires in low dimensions based on entire sequence information, which estimated the low-dimensional structure after embedding the pairwise high-dimensional sequence dissimilarities (<xref ref-type="bibr" rid="B32">Yokota et al., 2017</xref>). Both of these approaches concentrated on the similarity comparisons among the entire sequences. However, because only a subset of TCR residues contributes to antigen-binding specificity (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>), the sequence comparison approaches were unable to focus on these residues, potentially resulting in poor performance. DeepCAT is a deep learning framework enabling <italic>de novo</italic> prediction of caTCRs (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>). Antigen-specific sequences in each repertoire were selected to be predicted by a set of the trained convolutional neural network (CNN) models after clustering CDR3 sequences based on their similarity (<xref ref-type="bibr" rid="B33">Zhang et al., 2020</xref>). The cancer score, which quantifies the likelihood that a repertoire is associated with cancer, was calculated using the average of the sequence predictions. Compared to the above two approaches, DeepCAT was able to predict the caTCRs more accurately and performed better in repertoire classification due to the cluster analyses in data preprocessing and deep learning&#x2019;s excellent feature extraction ability. 
However, DeepCAT ignored the correlations among TCRs in the same repertoire because of its simple definition of the cancer score, which assigned the same weight to all TCRs in a repertoire whereas they may warrant distinct weights. Ostmeyer&#x2019;s approach (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>) attempted to model the TCR correlations with multi-instance learning (MIL), but it used the standard MIL assumption (<xref ref-type="bibr" rid="B7">Dietterich et al., 1997</xref>; <xref ref-type="bibr" rid="B10">Foulds and Frank, 2010</xref>), predicting the repertoire as cancerous based on the presence of only one abnormal TCR, which is unsuitable and has a risk of increased false positives because a cancer patient&#x2019;s repertoire typically contains many caTCRs that are related to one another.</p>
<p>In summary, there is a dearth of caTCR prediction approaches that take into account the cancer-associated biochemical motifs with various lengths and correctly model the correlations among TCRs in the same repertoire. To bridge this gap, we developed a deep MIL method called DeepLION in this study to improve the prediction of caTCRs using TCR-seq data (<xref ref-type="fig" rid="F1">Figure 1</xref>). On one hand, the CNN with alternative convolution filters and 1-max pooling operations was used to handle AA fragments with different lengths in TCRs, where various lengths of cancer-associated motifs were identified; on the other hand, the MIL part of DeepLION assigned appropriate weights for each TCR after modeling the TCR correlations during the predicting process. We evaluated the performance of DeepLION on several cohorts of patients from nine cancer types and found that it achieved higher prediction accuracies, sensitivities, specificities, and areas under the receiver operating characteristic (ROC) curve (AUCs) for most of the cohorts compared with the current state-of-the-art methods, with the AUCs for thyroid and lung cancer cohorts reaching 0.97 and 0.90, respectively. Thus, DeepLION can accurately predict the caTCRs and distinguish the cancer-associated repertoires from those non-cancer ones, potentially assisting in the detection of malignancies based on TCR repertoire data.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>DeepLION for accurate TCR repertoire prediction. <bold>(A)</bold> The workflow of DeepLION is divided into three parts: data preprocessing, the CNN for TCRs, and MIL. During data preprocessing, the top <italic>k</italic> most abundant TCR sequences were extracted from each repertoire after removing unqualified sequences and they were encoded into matrixes by the Beshnova matrix. The CNN for TCRs consisted of 14 convolution filters covering six different region sizes, 1-max pooling operations, and a one-layer linear classifier L. The TCR matrixes were input to the CNN and their scores were output. In the MIL part, DeepLION employed another one-layer linear classifier L&#x2032; to aggregate <italic>k</italic> TCR scores to predict the repertoire. <bold>(B)</bold> The details of the convolution and pooling operations of CNN in DeepLION. When a 2 &#xd7; <italic>d</italic> convolution filter (the red box) performed a complete convolution operation on the TCR matrix from top to bottom, it could be regarded as extracting the biochemical features of the 2-mers such as "CA", "AS", etc., and then a 10 &#xd7; 1 feature map, a feature set of all 2-mers, was generated. Other filters performed similar convolution operations and 14 feature maps were obtained. The maximum value of each map (marked with a blue box) was selected by a 1-max pooling operation, which could be viewed as the feature of the <italic>z</italic>-mers most likely to be the cancer-specific motif. These features were interconnected to generate a 14 &#xd7; 1 TCR feature vector.</p>
</caption>
<graphic xlink:href="fgene-13-860510-g001.tif"/>
</fig>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and Methods</title>
<p>DeepLION is a deep MIL method based on TCR-seq data for caTCR prediction. The workflow of DeepLION consisted of three parts: data preprocessing, the CNN for TCRs, and MIL (<xref ref-type="fig" rid="F1">Figure 1A</xref>). First, <italic>k</italic> TCR&#x3b2; CDR3 sequences with the highest abundance in the repertoire were selected and encoded into the matrixes using AA biochemical features. After data preprocessing, the TCR matrixes with different sizes were fed into a deep CNN model, where the biochemical features of key motifs with various lengths were extracted by alternative convolution filters and 1-max pooling operations, and then the scores of TCRs, the probabilities that they were caTCRs, were calculated. Finally, a one-layer linear classifier L&#x2032; was employed to aggregate <italic>k</italic> TCR scores as the cancer score of the repertoire, which is used to predict whether it is cancerous.</p>
<sec id="s2-1">
<title>Data Preprocessing</title>
<p>We collected the CDR3 of TCR&#x3b2; from TCR-seq data to study. Considering that low-quality CDR3 sequences will affect the downstream analysis, the following types of sequences were removed as described in the previous study (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>): <italic>1</italic>) too short (&#x3c;10) or too long (&#x3e;24) sequences; <italic>2</italic>) sequences containing special characters (X, &#x2b;, &#x2217;, etc.); <italic>3</italic>) incomplete sequences, according to the ImMunoGeneTics (IMGT) nomenclature (<xref ref-type="bibr" rid="B18">Lefranc et al., 2015</xref>), not starting with the cysteine (C) or not ending with the phenylalanine (F); and <italic>4</italic>) sequences in which the variable gene locus was not resolved. There were some overlapping TCR sequences between healthy individuals and cancer patients, which were considered irrelevant to cancer and therefore also needed to be excluded. We used known training data to generate a reference dataset (denoted as D<sub>R</sub>) containing sequences that appeared at a high frequency in both healthy individuals and cancer patients (the top 20,000 sequences with the highest cloning frequency in each TCR-seq sample). Any sequences in each sample appearing in the D<sub>R</sub> were removed. After the above sequences were removed, the remaining TCR sequences were sorted in descending order of cloning frequency and the top <italic>k</italic> sequences were extracted for downstream analysis.</p>
<p>The raw TCR sequences were not directly input to the CNN because their antigen-binding ability was not well represented. AAs in TCR sequences can be represented by biochemical features, and a TCR sequence with length <italic>l</italic> can be encoded into an <italic>l</italic> &#xd7; <italic>d</italic> TCR matrix using a 20 &#xd7; <italic>d</italic> feature matrix for the 20 AAs. The AA index database (<xref ref-type="bibr" rid="B13">Kawashima and Kanehisa, 2000</xref>) documented 566 AA indices containing rich biochemical information based on previous literature. Since the original 566 AA indices are very large, directly utilizing them to characterize AAs may lead to a large input data size and too many parameters of CNN, which may cause problems such as high computational complexity and overfitting. In addition, many of the original AA indices are highly correlated with each other. As a result, we considered dimensionality reduction of the original AA indices. Currently, many studies have extracted low-dimensional orthogonal features from high-dimensional AA indices, reducing the dimensionality of a large number of AA indices with minimal information loss. Kidera et al. derived a 20 &#xd7; 10 feature matrix from 188 AA indices (<xref ref-type="bibr" rid="B14">Kidera et al., 1985</xref>), and Atchley et al. derived a 20 &#xd7; 5 feature matrix from 494 AA indices (<xref ref-type="bibr" rid="B1">Atchley et al., 2005</xref>). Beshnova et al. employed principal component analysis (PCA) to generate a 20 &#xd7; 15 feature matrix from 531 AA indices to characterize AAs (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>). Considering that the Beshnova matrix was obtained from the largest number of AA indices and encompassed the most biochemical information (explaining more than 95% of the variation in the data), we used the Beshnova matrix in our experiments (<italic>d</italic> &#x3d; 15) to encode the TCR sequences into matrixes.</p>
</sec>
<sec id="s2-2">
<title>Identifying Various Lengths of Cancer-Associated Motifs</title>
<p>The CNN is able to predict whether a TCR sequence is associated with cancer. TextCNN in natural language processing (NLP) first applied the CNN model to sequence analysis (<xref ref-type="bibr" rid="B34">Zhang and Wallace, 2015</xref>). Referring to the idea of TextCNN, the model consisted of a convolutional layer, a pooling layer, and a linear layer, as shown in <xref ref-type="fig" rid="F1">Figure 1A</xref>. Different from TextCNN, we developed innovative convolution filters in the convolutional layer to handle the amino acid fragments with different lengths according to the X-ray crystal structure analysis (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>). For a TCR matrix with dimension <italic>l</italic> &#xd7; <italic>d</italic>, the model extracted its features by a set of convolution filters with various sizes respectively, performed 1-max pooling operations for the low-dimensional feature maps obtained by each convolution filter, and concatenated the pooled results to generate a TCR feature vector. The TCR score, the probability of being a caTCR, was finally obtained by a one-layer linear classifier.</p>
<p>The convolutional layer of the model was designed with multiple convolution filters to extract the key biochemical motifs with distinct lengths in TCRs. By analyzing the X-ray crystal structure, Ostmeyer et al. identified the CDR3 residues in contact with peptide, which were thought to make the greatest contribution to the antigen-binding specificity of the TCRs, and determined the length of the AA fragments according to the analysis (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>). We counted the contiguous CDR3 residue regions (size &#x2265;2) in the 55 CDR3 sequences used for analysis (<xref ref-type="table" rid="T1">Table 1</xref>) and observed that the regions ranging in size from 2 to 8 were present, with regions of sizes 2&#x2013;4 occurring more frequently and regions of size eight occurring least frequently at 0.017. These contiguous regions were considered as potential cancer-specific motifs that contribute to the antigen-binding ability of TCRs. After excluding the regions with low frequencies (&#x3c;0.05), we designed a set of various convolution filters and specified the number of corresponding convolution filters according to the occurrence frequencies of the regions (<xref ref-type="table" rid="T1">Table 1</xref>). As shown in <xref ref-type="table" rid="T1">Table 1</xref> and <xref ref-type="fig" rid="F1">Figure 1A</xref>, the convolution filters with six different sizes were designed to extract features from TCR matrixes, and the number of convolution filters with each size was positively correlated with the occurrence frequency of the corresponding region, for a total of 14 convolution filters. Every complete convolution operation is defined as:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mtext>T</mml:mtext>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where the output sequence <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is composed of the results of each convolution <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, the activation function <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the Rectified Linear Unit (ReLU), <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are respectively the weight matrix and bias of the <italic>j</italic>th convolution filter <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with size <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the <italic>i</italic>th TCR matrix, <inline-formula id="inf9">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the submatrix from row <italic>a</italic> to row <italic>b</italic> in <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and &#x2219; denotes the dot product (a sum of element-wise multiplications) between the submatrix and the filter. <xref ref-type="disp-formula" rid="e1">Equation 1</xref> showed the convolution operation that one convolution filter performed on the TCR matrix, and then the corresponding feature map, <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, was obtained. Taking the 11 &#xd7; <italic>d</italic> TCR matrix shown in <xref ref-type="fig" rid="F1">Figure 1B</xref> as an example, when the convolution filter with size 2 &#xd7; <italic>d</italic> performed a complete convolution operation on the matrix from top to bottom, it could be regarded as extracting the biochemical features of the 2-mers in TCRs such as "CA", "AS", etc., and finally obtaining the 10 &#xd7; 1 feature map, the feature set of all 2-mers, which probably included cancer-specific motifs. The filters with other sizes performed similar convolution operations. Because the frequencies of key motifs with different lengths varied, the numbers of convolution filters with various sizes were adjusted to give them appropriate weights in the model. ReLU was chosen as the activation function following the convolution operations due to its low computational complexity. Additionally, it can avoid the vanishing gradient or exploding gradient problems that Sigmoid and Tanh can cause. The disadvantage of ReLU, the dead ReLU problem, was mitigated by using Xavier initialization (<xref ref-type="bibr" rid="B11">Glorot and Bengio, 2010</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>The situation of continuous CDR3 residue regions and convolution filter design.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Size of region</th>
<th align="center">Number of regions</th>
<th align="center">Frequency of region</th>
<th align="center">Size of filter</th>
<th align="center">Number of filters</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">2</td>
<td align="center">12</td>
<td align="char" char=".">0.207</td>
<td align="center">2 &#xd7; <italic>d</italic>
</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">3</td>
<td align="center">12</td>
<td align="char" char=".">0.207</td>
<td align="center">3 &#xd7; <italic>d</italic>
</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">4</td>
<td align="center">13</td>
<td align="char" char=".">0.224</td>
<td align="center">4 &#xd7; <italic>d</italic>
</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">5</td>
<td align="center">8</td>
<td align="char" char=".">0.138</td>
<td align="center">5 &#xd7; <italic>d</italic>
</td>
<td align="center">2</td>
</tr>
<tr>
<td align="left">6</td>
<td align="center">7</td>
<td align="char" char=".">0.121</td>
<td align="center">6 &#xd7; <italic>d</italic>
</td>
<td align="center">2</td>
</tr>
<tr>
<td align="left">7</td>
<td align="center">5</td>
<td align="char" char=".">0.086</td>
<td align="center">7 &#xd7; <italic>d</italic>
</td>
<td align="center">1</td>
</tr>
<tr>
<td align="left">8</td>
<td align="center">1</td>
<td align="char" char=".">0.017</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2014;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The 1-max pooling function was adopted to pool the feature maps generated after convolution operations of each convolution filter, reducing the mapping dimension to 1. The following pooling functions are commonly used: <italic>1</italic>) the element-wise maximum function (<inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>Max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>); <italic>2</italic>) the element-wise average function (<inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>Avg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>); <italic>3</italic>) the log-sum-exp (LSE) function (<inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>LSE</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) (<xref ref-type="bibr" rid="B23">Sahasrabudhe et al., 2021</xref>). These are defined as:<disp-formula id="e2">
<mml:math id="m16">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>Max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m17">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>Avg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mtext>and</mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m18">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>LSE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:mi>exp</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf15">
<mml:math id="m19">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the output sequence of <inline-formula id="inf16">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with size <inline-formula id="inf17">
<mml:math id="m21">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> after the convolution operation on <inline-formula id="inf18">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The feature maps obtained could be viewed as the feature sets of all <italic>z</italic>-mers <inline-formula id="inf19">
<mml:math id="m23">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Given that the goal of the CNN was to distill the potential cancer-specific motifs in TCRs, we focused on the most contributing features in the feature sets, which are most likely from the key motifs, and ignored the features of other <italic>z</italic>-mers. With the 1-max pooling function, the most contributing features of each feature set were extracted and other non-key motifs&#x2019; features were discarded, whereas the features pooled by the average function and the LSE function were affected by other non-key motifs, which caused adverse effects on the classification ability of the model. Considering that one TCR may contain multiple short key motifs with the same length, the drawback that the 1-max pooling operation can only extract the feature of one key motif from one sequence can be compensated for by the design of multiple convolution filters with the same size in the convolutional layer. As shown in <xref ref-type="fig" rid="F1">Figure 1B</xref>, the most contributing features (marked with blue boxes) extracted by pooling were concatenated to generate a 14 &#xd7; 1 TCR feature vector as:<disp-formula id="e5">
<mml:math id="m24">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>Max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x1d4f8;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mn>14</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mtext>T</mml:mtext>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<xref ref-type="disp-formula" rid="e5">Equation 5</xref> showed the 1-max pooling operation performed on the feature map, and then the TCR feature vector, <inline-formula id="inf20">
<mml:math id="m25">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, was obtained, which consisted of the most contributing features from the feature maps.</p>
<p>Ultimately, a one-layer linear classifier L was applied to aggregate the features extracted from each convolution kernel and predict the score for that TCR sequence. L that assigned scores to the TCRs is given by:<disp-formula id="e6">
<mml:math id="m26">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi mathvariant="normal">T</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mtext>L</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf21">
<mml:math id="m27">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the probability that the TCR is associated with cancer, the activation function <inline-formula id="inf22">
<mml:math id="m28">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the sigmoid function to normalize the scores, and <inline-formula id="inf23">
<mml:math id="m29">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mtext>L</mml:mtext>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mn>14</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf24">
<mml:math id="m30">
<mml:mrow>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mtext>L</mml:mtext>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are respectively the weight matrix and bias of L. <xref ref-type="disp-formula" rid="e6">Equation 6</xref> showed the operation process in L, and the probability that the TCR was associated with cancer was obtained. The TCR was predicted to be caTCR when <inline-formula id="inf25">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and was otherwise predicted to be noncancerous. A multi-layer linear classifier is capable of fitting the data better, but it also makes the structure of CNN more complicated and introduces the risk of overfitting. To reduce overfitting, a one-layer linear classifier is applied to the model to predict the TCR scores.</p>
<p>The CNN model jointly learns the various convolution filters and L, so it is end-to-end trainable, and the preprocessed TCRs and their corresponding labels are needed for model training. Because whether a TCR is a caTCR follows a Bernoulli distribution, the negative log-likelihood function (also known as the cross-entropy function) was used as the loss function to train the model, which is defined as:<disp-formula id="e7">
<mml:math id="m32">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>CNN</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mtext>ln</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mtext>ln</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>During the training process, random dropouts at a rate of 40% were applied to L to mitigate overfitting.</p>
<p>Because zero-padding will alter the distribution of input data, DeepCAT (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>) built five distinct CNN models for TCR sequences from 12 to 16 in length, which made it cumbersome and unable to utilize the information of TCRs with other lengths, whereas TCRs with different lengths were processed by one model in our method. In the actual training process, we added zeros to the end of the shorter sequences to achieve the length of the longest sequences (<italic>l</italic> &#x3d; 24) to ensure the consistency of the dimensionality of the input TCR matrices. Because the features of all motifs containing zeros, which were viewed as non-key motifs, were discarded after the 1-max pooling operations in the model, the classification ability of the model did not deteriorate.</p>
</sec>
<sec id="s2-3">
<title>Multi-Instance Learning Modeling the TCR Correlations</title>
<p>Predicting whether a repertoire is cancer-associated from the TCRs it contains can be described as MIL, where the TCRs are instances and the repertoires are bags. The standard MIL assumption states that each instance in a bag can be classified as either positive (1) or negative (0), and the label of a bag is 1 when it includes one or more positive instances (<xref ref-type="bibr" rid="B7">Dietterich et al., 1997</xref>; <xref ref-type="bibr" rid="B10">Foulds and Frank, 2010</xref>). Ostmeyer et al. applied this assumption in their study (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>). However, it is inappropriate to predict the repertoire as having cancer merely from the presence of a non-normal TCR because the repertoire of a cancer patient usually contains many caTCRs, which are somehow related to each other. In addition, the needed labels of TCRs are unknown, which means that it is difficult to know whether a TCR is associated with cancer or not. Although Beshnova et al. obtained potential caTCRs used for model training from TCGA tumor RNA-seq samples in advance in their study by TRUST (<xref ref-type="bibr" rid="B20">Li et al., 2017</xref>) and sequence filtering based on a reference database (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>), the evidence that all sequences were positive was lacking. The definition of cancer score for the repertoire by averaging the predictions for TCRs in a repertoire was also inaccurate because we could not prove that all TCRs carried the same weight. Therefore, we designed a one-layer linear classifier <inline-formula id="inf26">
<mml:math id="m33">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as an aggregating function to combine the scores of <italic>k</italic> TCRs collected from repertoires to predict the repertoire. <inline-formula id="inf27">
<mml:math id="m34">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is defined as:<disp-formula id="e8">
<mml:math id="m35">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
<mml:mi mathvariant="normal">T</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mtext>T</mml:mtext>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf28">
<mml:math id="m36">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the probability that the repertoire has cancer, the activation function <inline-formula id="inf29">
<mml:math id="m37">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the sigmoid function, and <inline-formula id="inf30">
<mml:math id="m38">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf31">
<mml:math id="m39">
<mml:mrow>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:msup>
<mml:mtext>L</mml:mtext>
<mml:mtext>&#x2032;</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are respectively the weight matrix and bias of <inline-formula id="inf32">
<mml:math id="m40">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <xref ref-type="disp-formula" rid="e8">Equation 8</xref> showed the operation process in <inline-formula id="inf33">
<mml:math id="m41">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, which was similar to <xref ref-type="disp-formula" rid="e6">Eq. 6</xref>, and the probability that the repertoire was associated with cancer was obtained. The repertoire was predicted to be cancer-associated when <inline-formula id="inf34">
<mml:math id="m42">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and was otherwise predicted to be noncancerous. A MIL model consisting of the CNN and <inline-formula id="inf35">
<mml:math id="m43">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is also end-to-end trainable, and its loss function is the negative log-likelihood function defined as:<disp-formula id="e9">
<mml:math id="m44">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>MIL</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mtext>ln</mml:mtext>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mtext>ln</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>Instead of simply taking the max value (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>) or the average (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>) value of all TCR scores as the cancer score of the repertoire, <inline-formula id="inf36">
<mml:math id="m45">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> was capable of learning the interrelationships among TCRs and assigning the appropriate weights to each TCR after model training. Similar to the idea of L, the multi-layer linear classifier was replaced by <inline-formula id="inf37">
<mml:math id="m46">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and random dropouts at a rate of 40% were applied to <inline-formula id="inf38">
<mml:math id="m47">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mi mathvariant="normal">&#x2032;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> during training in order to alleviate overfitting.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<p>We conducted a series of experiments on several cohorts of patients covering multiple cancers and healthy donors to validate the performance of DeepLION. In <xref ref-type="sec" rid="s3-1">section 3.1</xref>, we detailed how we acquired the data for the experiments. In <xref ref-type="sec" rid="s3-2">section 3.2</xref>, we assessed the capacity of the CNN framework in DeepLION to predict the caTCRs. In <xref ref-type="sec" rid="s3-3">section 3.3</xref>, we evaluated the performance of the entire DeepLION when predicting repertoires.</p>
<sec id="s3-1">
<title>Collecting the Data</title>
<p>We used the publicly available TCR-seq data from Adaptive Biotechnologies immuneACCESS online database (IA), which contains eight groups of peripheral blood mononuclear cell (PBMC) samples with diverse cancer types and one group of non-cancer PBMC samples. To validate the performance of the models on Asian patients, the TCR-seq data from the clinical database of Geneplus Technology Ltd. in Shenzhen (Geneplus) were also used (<xref ref-type="bibr" rid="B17">Lan et al., 2020</xref>; <xref ref-type="bibr" rid="B21">Li et al., 2021</xref>), which included PBMC and tumor-infiltrating T lymphocyte (TIL) samples from patients with thyroid cancer (THCA) and lung cancer, as well as non-cancer PBMC samples. <xref ref-type="table" rid="T2">Table 2</xref> shows the specifics of the various datasets that were used in the experiments.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>The specifics of the datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Source</th>
<th align="center">Disease</th>
<th align="center">Cell type</th>
<th align="center">Data type</th>
<th align="center">Sample size</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="9" align="left">IA</td>
<td align="left">Melanoma</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">21</td>
</tr>
<tr>
<td align="left">BRCA</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">16</td>
</tr>
<tr>
<td align="left">Ovarian</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">4</td>
</tr>
<tr>
<td align="left">Pancreatic</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">7</td>
</tr>
<tr>
<td align="left">Bladder</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">30</td>
</tr>
<tr>
<td align="left">GBM</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">15</td>
</tr>
<tr>
<td align="left">Lung</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">29</td>
</tr>
<tr>
<td align="left">CRC</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">Non-cancer</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">786</td>
</tr>
<tr>
<td rowspan="3" align="left">Geneplus</td>
<td align="left">THCA</td>
<td align="left">PBMCs and TILs</td>
<td align="center">TCR-seq</td>
<td align="center">170</td>
</tr>
<tr>
<td align="left">Lung</td>
<td align="left">PBMCs and TILs</td>
<td align="center">TCR-seq</td>
<td align="center">184</td>
</tr>
<tr>
<td align="left">Non-cancer</td>
<td align="left">PBMCs</td>
<td align="center">TCR-seq</td>
<td align="center">260</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>IA, Adaptive Biotechnologies immuneACCESS online database; BRCA, breast cancer; GBM, glioblastoma multiforme; CRC, colorectal cancer; THCA, thyroid cancer; PBMCs, peripheral blood mononuclear cells; TILs, tumor-infiltrating T lymphocytes; TCR-seq, T cell receptor-sequencing.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="sec" rid="s3-2">section 3.2</xref>, we gathered the training data of DeepCAT to train the CNN framework in DeepLION. The label-encoded training data was composed of cancer (<italic>n</italic> &#x3d; 30,000) and non-cancer (<italic>n</italic> &#x223c; 60,000) TCR CDR3 sequences. The cancer sequences were derived from The Cancer Genome Atlas (TCGA) (<xref ref-type="bibr" rid="B26">Tomczak et al., 2015</xref>) tumor RNA-sequencing (RNA-seq) samples covering multiple cancers using the TRUST algorithm (<xref ref-type="bibr" rid="B20">Li et al., 2017</xref>), whereas non-cancer sequences were derived from the healthy individuals H<sub>2</sub> (<italic>n</italic> &#x3d; 120) (<xref ref-type="bibr" rid="B8">Emerson et al., 2017</xref>), which are independent of healthy individuals H<sub>1</sub> (<italic>n</italic> &#x3d; 666). The test data consisted of two datasets (T<sub>1</sub> and T<sub>2</sub>). T<sub>1</sub> consisted of eight groups of samples, each of which contained cancer and H<sub>1</sub> PBMC samples from IA. All TCR-seq data in T<sub>2</sub> were obtained from Geneplus, which were collected from Asian populations. T<sub>2</sub> was composed of two groups of samples: the first group contained THCA PBMC and TIL samples (<italic>n</italic> &#x3d; 170) and the PBMC samples from the healthy individuals H<sub>3</sub> (<italic>n</italic> &#x3d; 260), and the second group contained lung cancer PBMC and TIL samples (<italic>n</italic> &#x3d; 184) and H<sub>3</sub> PBMC samples. <xref ref-type="table" rid="T3">Table 3</xref> shows the training and test data in <xref ref-type="sec" rid="s3-2">section 3.2</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>The training and test data in experiments.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Section</th>
<th align="center">Data type</th>
<th align="center">Data source</th>
<th align="center">Disease</th>
<th align="center">Cell type</th>
<th align="center">Sample size</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="14" align="left">3.2</td>
<td rowspan="2" align="left">Training data</td>
<td align="left">TCGA</td>
<td align="left">Multiple cancers</td>
<td align="left">&#x2014;</td>
<td align="center">30,000</td>
</tr>
<tr>
<td align="left">IA</td>
<td align="left">Non-cancer (H<sub>2</sub>)</td>
<td align="left">&#x2014;</td>
<td align="center">&#x223c;60,000</td>
</tr>
<tr>
<td rowspan="12" align="left">Test data</td>
<td rowspan="9" align="left">IA (T<sub>1</sub>)</td>
<td align="left">Melanoma</td>
<td align="left">PBMCs</td>
<td align="center">21</td>
</tr>
<tr>
<td align="left">BRCA</td>
<td align="left">PBMCs</td>
<td align="center">16</td>
</tr>
<tr>
<td align="left">Ovarian</td>
<td align="left">PBMCs</td>
<td align="center">4</td>
</tr>
<tr>
<td align="left">Pancreatic</td>
<td align="left">PBMCs</td>
<td align="center">7</td>
</tr>
<tr>
<td align="left">Bladder</td>
<td align="left">PBMCs</td>
<td align="center">30</td>
</tr>
<tr>
<td align="left">GBM</td>
<td align="left">PBMCs</td>
<td align="center">15</td>
</tr>
<tr>
<td align="left">Lung</td>
<td align="left">PBMCs</td>
<td align="center">29</td>
</tr>
<tr>
<td align="left">CRC</td>
<td align="left">PBMCs</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">Non-cancer (H<sub>1</sub>)</td>
<td align="left">PBMCs</td>
<td align="center">666</td>
</tr>
<tr>
<td rowspan="3" align="left">Geneplus (T<sub>2</sub>)</td>
<td align="left">THCA</td>
<td align="left">PBMCs and TILs</td>
<td align="center">170</td>
</tr>
<tr>
<td align="left">Lung</td>
<td align="left">PBMCs and TILs</td>
<td align="center">184</td>
</tr>
<tr>
<td align="left">Non-cancer (H<sub>3</sub>)</td>
<td align="left">PBMCs</td>
<td align="center">260</td>
</tr>
<tr>
<td rowspan="3" align="left">3.3</td>
<td rowspan="3" align="left">Training and test data</td>
<td rowspan="3" align="left">Geneplus (T<sub>2</sub>)</td>
<td align="left">THCA</td>
<td align="left">PBMCs and TILs</td>
<td align="center">170</td>
</tr>
<tr>
<td align="left">Lung</td>
<td align="left">PBMCs and TILs</td>
<td align="center">184</td>
</tr>
<tr>
<td align="left">Non-cancer (H<sub>3</sub>)</td>
<td align="left">PBMCs</td>
<td align="center">260</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>TCGA, The Cancer Genome Atlas; IA, Adaptive Biotechnologies immuneACCESS online database; BRCA, breast cancer; GBM, glioblastoma multiforme; CRC, colorectal cancer; THCA, thyroid cancer; PBMCs, peripheral blood mononuclear cells; TILs, tumor-infiltrating T lymphocytes.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="sec" rid="s3-3">section 3.3</xref>, we used T<sub>2</sub> to train and test the entire DeepLION due to the larger sizes of cancer samples in T<sub>2</sub> (<xref ref-type="table" rid="T3">Table 3</xref>). The samples in each group were randomly divided into five equal parts, three of which were used as the training set, one as the validation set, and one as the test set.</p>
</sec>
<sec id="s3-2">
<title>Predictive Capacity Evaluation of the CNN Framework in DeepLION for TCRs</title>
<p>DeepCAT (M<sup>CAT</sup>) (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>) is currently a preferred method for <italic>de novo</italic> caTCR prediction from the peripheral blood. To validate the performance of the CNN framework in DeepLION (M<sup>CNN</sup>) when predicting the caTCRs, we conducted experiments to compare M<sup>CNN</sup> with M<sup>CAT</sup>. M<sup>CAT</sup> and M<sup>CNN</sup> shared the same training data (<xref ref-type="table" rid="T3">Table 3</xref>), and we processed the data as described in M<sup>CAT</sup>.</p>
<p>Before the training process, all the training sequences were encoded into <italic>l</italic> &#xd7; 15 TCR matrixes (<italic>d</italic> &#x3d; 15) by the Beshnova matrix (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>). To obtain the final model, we trained M<sup>CNN</sup> five times independently. In each training process, we randomly selected two-thirds of the training data (20,000 cancer and 40,000 non-cancer samples) as the training set and used them to train the model for 1,000 epochs at a learning rate of 0.001 with the assurance that the model reached convergence. The remaining training data were used as the validation set and AUCs were estimated to evaluate the trained models. Ultimately, the five trained models had similar AUCs, and the model with the highest AUC (0.85) was selected as the final model. The trained M<sup>CAT</sup> was obtained from GitHub.</p>
<p>All the test data (<xref ref-type="table" rid="T3">Table 3</xref>) were processed in several steps. First, the top 10,000 most abundant sequences (<italic>k &#x3d;</italic> 10,000) of each sample were extracted after low-quality sequences were removed. Second, we used iSMART (<xref ref-type="bibr" rid="B33">Zhang et al., 2020</xref>) with default parameters to cluster the sequences, and then the antigen-specific sequences were selected. Third, all the sequences were encoded into TCR matrixes for the downstream analysis.</p>
<p>The trained M<sup>CAT</sup>, as well as the trained M<sup>CNN</sup>, was applied to the processed test data, and the cancer score of each sample was defined by averaging all the input sequence scores in the sample. The sensitivities (at 0.9 specificities) and AUCs of both models were estimated for evaluating the models (<xref ref-type="table" rid="T4">Table 4</xref>). The results showed that M<sup>CNN</sup> performed significantly better than M<sup>CAT</sup> in terms of both the sensitivities and AUCs on each group of samples except for the melanoma, ovarian cancer, and colorectal cancer samples in T<sub>1</sub>, where M<sup>CNN</sup>&#x2019;s performance was close to that of M<sup>CAT</sup>.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>The performance of models on different cancer samples.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th colspan="8" align="center">T<sub>1</sub> <xref ref-type="table-fn" rid="Tfn1">
<sup>a</sup>
</xref>
</th>
<th colspan="2" align="center">T<sub>2</sub>
</th>
</tr>
<tr>
<th align="left"/>
<th colspan="2" align="center">Melanoma (21)</th>
<th colspan="2" align="center">BRCA (16)</th>
<th colspan="2" align="center">Ovarian (4)</th>
<th colspan="2" align="center">Pancreatic (7)</th>
<th colspan="2" align="center">THCA (170)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left"/>
<td align="center">M<sup>CAT</sup> <xref ref-type="table-fn" rid="Tfn2">
<sup>b</sup>
</xref>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
</tr>
<tr>
<td align="left">SEN</td>
<td align="char" char=".">0.762</td>
<td align="char" char=".">0.762</td>
<td align="char" char=".">0.438</td>
<td align="char" char=".">0.750</td>
<td align="char" char=".">1</td>
<td align="char" char=".">1</td>
<td align="char" char=".">0.714</td>
<td align="center">1</td>
<td align="char" char=".">0.353</td>
<td align="char" char=".">0.453</td>
</tr>
<tr>
<td align="left">AUC</td>
<td align="char" char=".">0.912</td>
<td align="char" char=".">0.900</td>
<td align="char" char=".">0.854</td>
<td align="char" char=".">0.892</td>
<td align="char" char=".">0.988</td>
<td align="char" char=".">0.989</td>
<td align="char" char=".">0.945</td>
<td align="center">0.962</td>
<td align="char" char=".">0.692</td>
<td align="char" char=".">0.724</td>
</tr>
<tr>
<td align="left"/>
<td colspan="2" align="center">
<bold>Bladder (30)</bold>
</td>
<td colspan="2" align="center">
<bold>GBM (15)</bold>
</td>
<td colspan="2" align="center">
<bold>Lung (29)</bold>
</td>
<td colspan="2" align="center">
<bold>CRC (3)</bold>
</td>
<td colspan="2" align="center">
<bold>Lung (184)</bold>
</td>
</tr>
<tr>
<td align="left"/>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
<td align="center">M<sup>CAT</sup>
</td>
<td align="center">M<sup>CNN</sup>
</td>
</tr>
<tr>
<td align="left">SEN</td>
<td align="char" char=".">0.733</td>
<td align="char" char=".">0.767</td>
<td align="char" char=".">0.133</td>
<td align="char" char=".">0.133</td>
<td align="char" char=".">0.241</td>
<td align="char" char=".">0.310</td>
<td align="center">1</td>
<td align="center">1</td>
<td align="char" char=".">0.326</td>
<td align="char" char=".">0.473</td>
</tr>
<tr>
<td align="left">AUC</td>
<td align="char" char=".">0.881</td>
<td align="char" char=".">0.913</td>
<td align="char" char=".">0.665</td>
<td align="char" char=".">0.690</td>
<td align="char" char=".">0.535</td>
<td align="char" char=".">0.663</td>
<td align="center">1</td>
<td align="center">0.995</td>
<td align="char" char=".">0.736</td>
<td align="char" char=".">0.753</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>SEN, sensitivity; AUC, area under the receiver operating characteristic curve; BRCA, breast cancer; GBM, glioblastoma multiforme; CRC, colorectal cancer; THCA, thyroid cancer.</p>
</fn>
<fn id="Tfn1">
<label>a</label>
<p>Each group of samples was a mix of cancer and control samples (n &#x3d; 666 for groups of T<sub>1</sub> and n &#x3d; 260 for groups of T<sub>2</sub>).</p>
</fn>
<fn id="Tfn2">
<label>b</label>
<p>The thresholds of two models were set at 0.9 specificity (M<sup>CAT</sup>: 0.277 for T<sub>1</sub> and 0.351 for T<sub>2</sub>; M<sup>CNN</sup>: 0.392 for T<sub>1</sub> and 0.423 for T<sub>2</sub>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To further compare the performance of these two models, M<sup>CAT</sup> and M<sup>CNN</sup> were also applied to the combined cancer samples of T<sub>1</sub> and T<sub>2</sub>, which contained all the various cancer samples as well as the control samples, and the ROC curves were generated based on the prediction results (<xref ref-type="fig" rid="F2">Figure 2</xref>). As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, the AUCs of M<sup>CNN</sup> were higher than those of M<sup>CAT</sup> on both T<sub>1</sub> and T<sub>2</sub> (T<sub>1</sub>: 0.83 &#x3e; 0.78; T<sub>2</sub>: 0.73 &#x3e; 0.71), indicating that it had the better feature extraction and prediction ability for TCRs.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The ROC curves of models on combined cancer samples. <bold>(A)</bold> The ROC curves on T<sub>1</sub>. The AUC of M<sup>CAT</sup> is 0.78 whereas the AUC of M<sup>CNN</sup> is 0.83. <bold>(B)</bold> The ROC curves on T<sub>2</sub>. The AUC of M<sup>CAT</sup> is 0.71 whereas the AUC of M<sup>CNN</sup> is 0.73.</p>
</caption>
<graphic xlink:href="fgene-13-860510-g002.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>Performance Assessment of the Entire DeepLION</title>
<p>In comparison with M<sup>CAT</sup> and M<sup>CNN</sup>, we expected that the entire DeepLION (M<sup>LION</sup>) as a MIL method was capable of achieving more accurate caTCR prediction after correctly modeling the correlations among TCRs. Because the logistic regression model combined with MIL (denoted as M<sup>LOG</sup>) (<xref ref-type="bibr" rid="B22">Ostmeyer et al., 2019</xref>) is a classical MIL method to distinguish tumor tissues from healthy tissues accurately by identifying the cancer-specific motifs in TCRs, we also compared M<sup>LION</sup> with it.</p>
<p>We trained two models for THCA and lung cancer (denoted as <inline-formula id="inf39">
<mml:math id="m48">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>T</mml:mtext>
<mml:mrow>
<mml:mtext>LION</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf40">
<mml:math id="m49">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>LION</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>) respectively (<xref ref-type="table" rid="T3">Table 3</xref>). To obtain <inline-formula id="inf41">
<mml:math id="m50">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>T</mml:mtext>
<mml:mrow>
<mml:mtext>LION</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, we first extracted the top 100 most abundant sequences (<italic>k</italic> &#x3d; 100) from each sample and encoded them into <italic>l</italic> &#xd7; 15 TCR matrixes (<italic>d</italic> &#x3d; 15) by the Beshnova matrix (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>). Then, we trained the model five times independently, in each of which the model was trained with the training set for 700 epochs at a learning rate of 0.001 with the assurance that the model reached convergence. The AUCs of the trained models on the validation set were estimated for model selection, and the model with the highest AUC (0.95) was selected as the final model. <inline-formula id="inf42">
<mml:math id="m51">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>LION</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> (AUC &#x3d; 0.86) was obtained by the same training procedure. After using the same training sets as M<sup>LION</sup> to train M<sup>LOG</sup> with default training parameters, we also obtained <inline-formula id="inf43">
<mml:math id="m52">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>T</mml:mtext>
<mml:mrow>
<mml:mtext>LOG</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf44">
<mml:math id="m53">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>LOG</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for THCA and lung cancer respectively.</p>
<p>Four trained models were applied to the corresponding test sets and the accuracies, sensitivities, specificities, and AUCs were calculated based on the sample predictions to evaluate their performances (<xref ref-type="table" rid="T5">Table 5</xref>). The results of M<sup>CAT</sup> and M<sup>CNN</sup> on the samples are also shown in <xref ref-type="table" rid="T5">Table 5</xref> for comparison. For a fair comparison, the classification thresholds of M<sup>CAT</sup> and M<sup>CNN</sup> for the samples (M<sup>CAT</sup>: 0.336 for THCA and 0.321 for lung cancer; M<sup>CNN</sup>: 0.433 for THCA and 0.419 for lung cancer) were determined by the Youden index (<xref ref-type="bibr" rid="B9">Fluss et al., 2005</xref>) enabling the selection of an optimal threshold value for classification, whereas the classification threshold of both M<sup>LOG</sup> and M<sup>LION</sup> for any sample is a fixed value (0.5). The ROC curves were generated based on the predicted probabilities of all models on the two samples (<xref ref-type="fig" rid="F3">Figure 3</xref>). The results demonstrated that despite the specificity of M<sup>LION</sup> being slightly lower than M<sup>CNN</sup>&#x2019;s on the THCA samples, its accuracies, sensitivities, and AUCs were significantly better than the other models on both THCA and lung cancer samples, which indicated that M<sup>LION</sup> can accurately predict the samples.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>The performances of models on THCA and lung cancer samples.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th colspan="4" align="center">THCA (170) <xref ref-type="table-fn" rid="Tfn3">
<sup>a</sup>
</xref>
</th>
<th colspan="4" align="center">Lung (184)</th>
</tr>
<tr>
<th align="left"/>
<th align="center">
<inline-formula id="inf45">
<mml:math id="m54">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>T</mml:mtext>
<mml:mrow>
<mml:mtext>LOG</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> <xref ref-type="table-fn" rid="Tfn4">
<sup>b</sup>
</xref>
</th>
<th align="center">M<sup>CAT</sup>
</th>
<th align="center">M<sup>CNN</sup>
</th>
<th align="center">
<inline-formula id="inf46">
<mml:math id="m55">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>T</mml:mtext>
<mml:mrow>
<mml:mtext>LION</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf47">
<mml:math id="m56">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>LOG</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">M<sup>CAT</sup>
</th>
<th align="center">M<sup>CNN</sup>
</th>
<th align="center">
<inline-formula id="inf48">
<mml:math id="m57">
<mml:mrow>
<mml:msubsup>
<mml:mtext>M</mml:mtext>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>LION</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">ACC</td>
<td align="char" char=".">0.651</td>
<td align="char" char=".">0.693</td>
<td align="char" char=".">0.753</td>
<td align="char" char=".">0.872</td>
<td align="char" char=".">0.659</td>
<td align="char" char=".">0.698</td>
<td align="char" char=".">0.732</td>
<td align="char" char=".">0.818</td>
</tr>
<tr>
<td align="left">SEN</td>
<td align="char" char=".">0.444</td>
<td align="char" char=".">0.488</td>
<td align="char" char=".">0.418</td>
<td align="char" char=".">0.775</td>
<td align="char" char=".">0.552</td>
<td align="char" char=".">0.625</td>
<td align="char" char=".">0.538</td>
<td align="char" char=".">0.730</td>
</tr>
<tr>
<td align="left">SPE</td>
<td align="char" char=".">0.800</td>
<td align="char" char=".">0.827</td>
<td align="char" char=".">0.973</td>
<td align="char" char=".">0.957</td>
<td align="char" char=".">0.712</td>
<td align="char" char=".">0.750</td>
<td align="char" char=".">0.869</td>
<td align="char" char=".">0.882</td>
</tr>
<tr>
<td align="left">AUC</td>
<td align="char" char=".">0.600</td>
<td align="char" char=".">0.692</td>
<td align="char" char=".">0.724</td>
<td align="char" char=".">0.974</td>
<td align="char" char=".">0.680</td>
<td align="char" char=".">0.736</td>
<td align="char" char=".">0.753</td>
<td align="char" char=".">0.899</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>ACC, accuracy; SEN, sensitivity; SPE, specificity; AUC, area under the receiver operating characteristic curve; THCA, thyroid cancer.</p>
</fn>
<fn id="Tfn3">
<label>a</label>
<p>Each group of samples was a mix of cancer and control samples (n &#x3d; 260).</p>
</fn>
<fn id="Tfn4">
<label>b</label>
<p>The threshold of both M<sup>LOG</sup> and M<sup>LION</sup> was 0.5 for two samples and the thresholds of M<sup>CAT</sup> and M<sup>CNN</sup> were set by Youden index (M<sup>CAT</sup>: 0.336 for THCA and 0.321 for lung cancer; M<sup>CNN</sup>: 0.433 for THCA and 0.419 for lung cancer).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The ROC curves of models on T<sub>2</sub>. <bold>(A)</bold> The ROC curves on THCA samples. The AUCs of M<sup>LOG</sup>, M<sup>CAT</sup>, M<sup>CNN</sup>, M<sup>LION</sup> are 0.60, 0.69, 0.72 and 0.97. <bold>(B)</bold> The ROC curves on lung cancer samples. The AUCs of M<sup>LOG</sup>, M<sup>CAT</sup>, M<sup>CNN</sup>, M<sup>LION</sup> are 0.68, 0.74, 0.75 and 0.90.</p>
</caption>
<graphic xlink:href="fgene-13-860510-g003.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>In this study, we developed a deep learning method combined with MIL, called DeepLION, to improve the prediction of caTCRs. Compared to some of the current studies that decomposed TCR&#x3b2; CDR3 sequences into <italic>z</italic>-mers in the data preprocessing, DeepLION was able to extract the features of the cancer-specific motifs with different lengths by the group of various convolution filters and the 1-max pooling operations in the CNN; the MIL part of DeepLION assigned adjusted weights for each TCR after learning the TCR correlations in the prediction process while the existing methods often ignored the correlations among TCRs in the same repertoire. We conducted two experiments on several cohorts of patients from nine cancer types to evaluate the performances of DeepLION. We observed that DeepLION achieved higher prediction accuracies, sensitivities, and AUCs on most of the cohorts than the existing methods, where the AUC reached notably 0.97 and 0.90 for THCA and lung cancer cohorts, respectively.</p>
<p>In <xref ref-type="sec" rid="s3-2">section 3.2</xref>, with the elaborate design of convolution filters and 1-max pooling operations, M<sup>CNN</sup> was able to make full use of the information of all TCR sequences in the repertoire (M<sup>CAT</sup> could only process sequences of length 12&#x2013;16) and extract the features of motifs of various lengths from TCRs to make more accurate TCR prediction, which resulted in the better performance on the test data (<xref ref-type="table" rid="T4">Table 4</xref>). However, on some samples, such as glioblastoma multiforme and lung cancer samples in T<sub>1</sub> and both samples in T<sub>2</sub>, both models performed poorly (low sensitivities and AUCs) (<xref ref-type="table" rid="T4">Table 4</xref>). This could be due to the ambiguity of the training sequence labels (not all cancer sequences for training were confirmed to be associated with cancer) and the simple definition of the repertoire cancer score, averaging all TCR scores as the repertoire cancer score, which didn&#x2019;t fully utilize TCR relationships or assign appropriate weights to them. Due to this definition, the models&#x2019; classification thresholds were unknown, which is a huge challenge for cancer prediction, despite the high AUCs on samples like melanoma and ovarian cancer samples in T<sub>1</sub>. In our experiments, the thresholds were set based on a fixed specificity to facilitate comparison between models, and the thresholds set at 0.9 specificities for the models were different on T<sub>1</sub> and T<sub>2</sub> (M<sup>CAT</sup>: 0.277 for T<sub>1</sub> and 0.351 for T<sub>2</sub>; M<sup>CNN</sup>: 0.392 for T<sub>1</sub> and 0.423 for T<sub>2</sub>) (<xref ref-type="table" rid="T4">Table 4</xref>), indicating that it is difficult for the models to predict precisely for datasets from different sources with a fixed classification threshold. 
Furthermore, the poor performance of models trained with TCGA data on both Asian samples in T<sub>2</sub> could be explained by possible differences in TCR repertoires between patients of different races.</p>
<p>In <xref ref-type="sec" rid="s3-3">section 3.3</xref>, M<sup>LOG</sup> showed the worst performance on the samples (<xref ref-type="table" rid="T5">Table 5</xref>), because it applied the inappropriate MIL assumption, using the maximum TCR score as the repertoire cancer score, and used the Atchley matrix (<xref ref-type="bibr" rid="B1">Atchley et al., 2005</xref>) to characterize AAs, which contained less biochemical information than the Beshnova matrix (<xref ref-type="bibr" rid="B2">Beshnova et al., 2020</xref>), and it couldn&#x2019;t extract the features of cancer-specific motifs with distinct lengths in the repertoire. Although the Youden index was used to define the classification thresholds for M<sup>CAT</sup> and M<sup>CNN</sup>, their accuracies and sensitivities on two samples were significantly lower than those of M<sup>LION</sup> due to the incorrect definition of the repertoire cancer score and the possible differences in the repertoires of patients from different races (<xref ref-type="table" rid="T5">Table 5</xref>). Different from other methods, M<sup>LION</sup> used MIL to learn the correlations among TCRs in the repertoire of the patients with the same cancer type, and assigned the adjusted weights to each TCR when calculating the cancer scores of the repertoire. Therefore, M<sup>LION</sup> was capable of predicting the caTCRs and classifying the samples more accurately than existing methods.</p>
<p>When the amounts of training data are small, overfitting is a concern with deep learning models. To reduce overfitting, we simplified the model by using one-layer linear classifiers instead of multi-layer linear classifiers. During the training process, we applied random dropouts at a rate of 40% to each linear classifier and employed early stopping (<xref ref-type="bibr" rid="B31">Yao et al., 2007</xref>) to the model. Five-fold cross-validation was performed 10 times on THCA and lung cancer samples separately to assess model generalization (<xref ref-type="table" rid="T6">Table 6</xref>). Given that K-fold cross-validation produces significantly skewed performance estimates with small sample sizes, whereas nested cross-validation produces robust and unbiased performance estimates regardless of sample size (<xref ref-type="bibr" rid="B30">Wang et al., 2018</xref>; <xref ref-type="bibr" rid="B29">Vabalas et al., 2019</xref>), we also applied the nested five-fold cross-validation to both THCA and lung cancer samples and repeated it 10 times to ensure the robustness of our evaluation results (<xref ref-type="table" rid="T6">Table 6</xref>). In comparison to the results of the five-fold cross-validation, some of the metrics in the nested cross-validation results degraded to some extent, but the overall performances of our model were stable, and all the metrics were higher than those of the two existing methods (<xref ref-type="table" rid="T5">Table 5</xref>), indicating that our model had a high degree of generalizability.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>The performances of cross-validations on THCA and lung cancer samples.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th colspan="2" align="center">THCA (170) <xref ref-type="table-fn" rid="Tfn5">
<sup>a</sup>
</xref>
</th>
<th colspan="2" align="center">Lung (184)</th>
</tr>
<tr>
<th align="left"/>
<th align="center">K-fold</th>
<th align="center">Nested</th>
<th align="center">K-fold</th>
<th align="center">Nested</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">ACC</td>
<td align="center">0.843 &#xb1; 0.017 <xref ref-type="table-fn" rid="Tfn6">
<sup>b</sup>
</xref>
</td>
<td align="center">0.817 &#xb1; 0.010</td>
<td align="center">0.786 &#xb1; 0.020</td>
<td align="center">0.741 &#xb1; 0.010</td>
</tr>
<tr>
<td align="left">SEN</td>
<td align="center">0.773 &#xb1; 0.025</td>
<td align="center">0.706 &#xb1; 0.021</td>
<td align="center">0.697 &#xb1; 0.035</td>
<td align="center">0.679 &#xb1; 0.026</td>
</tr>
<tr>
<td align="left">SPE</td>
<td align="center">0.892 &#xb1; 0.035</td>
<td align="center">0.910 &#xb1; 0.016</td>
<td align="center">0.848 &#xb1; 0.025</td>
<td align="center">0.783 &#xb1; 0.019</td>
</tr>
<tr>
<td align="left">AUC</td>
<td align="center">0.925 &#xb1; 0.010</td>
<td align="center">0.909 &#xb1; 0.007</td>
<td align="center">0.841 &#xb1; 0.014</td>
<td align="center">0.806 &#xb1; 0.011</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>ACC, accuracy; SEN, sensitivity; SPE, specificity; AUC, area under the receiver operating characteristic curve; THCA, thyroid cancer.</p>
</fn>
<fn id="Tfn5">
<label>a</label>
<p>Each group of samples was a mix of cancer and control samples (n &#x3d; 260).</p>
</fn>
<fn id="Tfn6">
<label>b</label>
<p>The results show 95% confidence intervals for all the validations (50 validations in total for each cross-validation).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The results of two cross-validations indicated that our model had higher specificities than sensitivities on both THCA and lung cancer (<xref ref-type="table" rid="T6">Table 6</xref>). We reasoned that this is likely because some caTCRs have sequence similarities with non-cancer TCRs, but their antigenic specificities differ due to their different spatial structures; there are many more non-cancer TCRs than caTCRs in a cancer-associated repertoire, which also brings additional difficulty to identifying the caTCRs based on the limited TCR-seq data. Furthermore, our model is rigorous in determining cancer samples and makes negative judgments on ambiguous samples. As a result, the model is more prone to making mistakes when it comes to predicting cancer samples. Likewise, the existing methods also had lower sensitivities than specificities (<xref ref-type="table" rid="T5">Table 5</xref>). But DeepLION outperformed them in cancer sample prediction due to the unique architecture of CNN and the MIL part.</p>
<p>DeepLION is easy to apply because it has few adjustable hyperparameters: only the number of TCRs extracted from one repertoire, <italic>k</italic>, and the learning rate. The computational complexity of the model is lower when <italic>k</italic> is smaller, but the model then uses less TCR information, which could reduce the model performance. Thus, <italic>k</italic> should be as small as possible while ensuring satisfactory model performance. Because our experiments showed that the model performance was unsatisfactory when <italic>k</italic> &#x3c; 100, we finally set <italic>k</italic> to 100. In addition, the abundance of TCRs affects the predictions of the model. We discovered that the model performance deteriorated if we randomly extracted the 100 TCRs instead of the top 100 TCRs. Because the learning rate is usually set to 0.001, we used this value as well.</p>
<p>In future work, we will apply DeepLION to other cancer types. We also plan to improve our method so that it can extract the shared correlations among TCRs of patients with various cancers, allowing the trained model to detect various cancers accurately. In this method, we will utilize TCR&#x3b1; sequences as well as TCR&#x3b2; sequences for analysis and explore a better encoding operation to characterize TCRs.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<title>Conclusion</title>
<p>DeepLION introduces a deep MIL framework to consider the various lengths of the cancer-associated motifs and the correlations among TCRs, achieving a higher prediction accuracy in cancer detection from TCR repertoire data than the current state-of-the-art methods. Thus, DeepLION has the potential to support cancer detection from TCR repertoire data.</p>
</sec>
</body>
<back>
<sec id="s6">
<title>Data Availability Statement</title>
<p>The source code and the software tool, DeepLION, are available on GitHub, at <ext-link ext-link-type="uri" xlink:href="https://github.com/Bioinformatics7181/DeepLION">https://github.com/Bioinformatics7181/DeepLION</ext-link>, for academic usage only. The publicly available TCR-sequencing data for this study can be found in Adaptive Biotechnologies immuneACCESS online database, at <ext-link ext-link-type="uri" xlink:href="https://clients.adaptivebiotech.com/immuneaccess">https://clients.adaptivebiotech.com/immuneaccess</ext-link>. The thyroid cancer TCR-sequencing samples were derived from Lan et al. (<xref ref-type="bibr" rid="B17">Lan et al., 2020</xref>), which can be found in NCBI, at <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/bioproject/PRJNA642967">https://www.ncbi.nlm.nih.gov/bioproject/PRJNA642967</ext-link>, whereas the lung cancer TCR-sequencing samples were derived from Li et al. (<xref ref-type="bibr" rid="B21">Li et al., 2021</xref>). The training data of DeepCAT and the trained DeepCAT model can be found on GitHub, at <ext-link ext-link-type="uri" xlink:href="https://github.com/s175573/DeepCAT">https://github.com/s175573/DeepCAT</ext-link>.</p>
</sec>
<sec id="s7">
<title>Author Contributions</title>
<p>JW, YX, and XQ conceived and designed the experiments; XQ performed the experiments; YX and XQ analyzed the data; YX contributed materials; JW, YX, and XQ wrote the paper. All authors have read and agreed to the published version of the manuscript.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>This research was funded by the Natural Science Basic Research Program of Shaanxi, grant number 2020JC-01. The article processing charge was funded by the Natural Science Basic Research Program of Shaanxi, grant number 2020JC-01.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Abbreviations</title>
<p>AA, Amino acid; ACC, Accuracy; AUC, Area under the receiver operating characteristic curve; AVG, Average; BRCA, Breast cancer; caTCR, Cancer-associated T cell receptor; CDR3, Complementarity determining region 3; CMV, Cytomegalovirus; CNN, Convolutional neural network; CRC, Colorectal cancer; GBM, Glioblastoma multiforme; IA, Adaptive Biotechnologies immuneACCESS online database; IR-seq, Immune repertoire sequencing; MHC, Major histocompatibility complex; MIL, Multi-instance learning; PBMC, Peripheral blood mononuclear cell; PCA, Principal component analysis; ReLU, Rectified linear unit; RNA-seq, Ribonucleic acid-sequencing; ROC, Receiver operating characteristic; SEN, Sensitivity; SPE, Specificity; TCGA, The Cancer Genome Atlas; TCR, T cell receptor; TCR-seq, T cell receptor-sequencing; THCA, Thyroid cancer; TIL, Tumor-infiltrating T lymphocyte.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Atchley</surname>
<given-names>W. R.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fernandes</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Dr&#xfc;ke</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Solving the Protein Sequence Metric Problem</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>102</volume>, <fpage>6395</fpage>&#x2013;<lpage>6400</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0408677102</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beshnova</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Onabolu</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Moon</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Y.-X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>De Novo prediction of Cancer-Associated T Cell Receptors for Noninvasive Cancer Detection</article-title>. <source>Sci. Transl. Med.</source> <volume>12</volume>, <fpage>eaaz3738</fpage>. <pub-id pub-id-type="doi">10.1126/scitranslmed.aaz3738</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chowell</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Krishna</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Becker</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Cocita</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>TCR Contact Residue Hydrophobicity Is a Hallmark of Immunogenic CD8&#x2b; T Cell Epitopes</article-title>. <source>Proc. Natl. Acad. Sci. USA.</source> <volume>112</volume>, <fpage>E1754</fpage>&#x2013;<lpage>E1762</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1500973112</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cinelli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Best</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Heather</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Reich-Zeliger</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shifrut</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Feature Selection Using a One Dimensional Na&#xef;ve Bayes&#x27; Classifier Increases the Accuracy of Support Vector Machine Classification of CDR3 Repertoires</article-title>. <source>Bioinformatics.</source> <volume>33</volume>, <fpage>951</fpage>&#x2013;<lpage>955</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btw771</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Coulie</surname>
<given-names>P. G.</given-names>
</name>
<name>
<surname>Van Den Eynde</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Van Der Bruggen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Boon</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Tumour Antigens Recognized by T Lymphocytes: At the Core of Cancer Immunotherapy</article-title>. <source>Nat. Rev. Cancer.</source> <volume>14</volume>, <fpage>135</fpage>&#x2013;<lpage>146</lpage>. <pub-id pub-id-type="doi">10.1038/nrc3670</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhodapkar</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dhodapkar</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Harnessing Shared Antigens and T-Cell Receptors in Cancer: Opportunities and Challenges</article-title>. <source>Proc. Natl. Acad. Sci. USA.</source> <volume>113</volume>, <fpage>7944</fpage>&#x2013;<lpage>7945</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1608860113</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dietterich</surname>
<given-names>T. G.</given-names>
</name>
<name>
<surname>Lathrop</surname>
<given-names>R. H.</given-names>
</name>
<name>
<surname>Lozano-P&#xe9;rez</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Solving the Multiple Instance Problem with axis-parallel Rectangles</article-title>. <source>Artif. Intelligence.</source> <volume>89</volume>, <fpage>31</fpage>&#x2013;<lpage>71</lpage>. <pub-id pub-id-type="doi">10.1016/s0004-3702(96)00034-3</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Emerson</surname>
<given-names>R. O.</given-names>
</name>
<name>
<surname>DeWitt</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Vignali</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gravley</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J. K.</given-names>
</name>
<name>
<surname>Osborne</surname>
<given-names>E. J.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Immunosequencing Identifies Signatures of Cytomegalovirus Exposure History and HLA-Mediated Effects on the T Cell Repertoire</article-title>. <source>Nat. Genet.</source> <volume>49</volume>, <fpage>659</fpage>&#x2013;<lpage>665</lpage>. <pub-id pub-id-type="doi">10.1038/ng.3822</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fluss</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Faraggi</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Reiser</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Estimation of the Youden Index and its Associated Cutoff point</article-title>. <source>Biom. J.</source> <volume>47</volume>, <fpage>458</fpage>&#x2013;<lpage>472</lpage>. <pub-id pub-id-type="doi">10.1002/bimj.200410135</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Foulds</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Frank</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>A Review of Multi-Instance Learning Assumptions</article-title>. <source>Knowledge Eng. Rev.</source> <volume>25</volume>, <fpage>1</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1017/S026988890999035X</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Glorot</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Understanding the Difficulty of Training Deep Feedforward Neural Networks</article-title>. <source>Proc. Thirteenth Int. Conf. Artif. Intelligence Stat.</source> <volume>9</volume>, <fpage>249</fpage>&#x2013;<lpage>256</lpage>. </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gubin</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Schuster</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Caron</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ward</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Noguchi</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Checkpoint Blockade Cancer Immunotherapy Targets Tumour-specific Mutant Antigens</article-title>. <source>Nature</source> <volume>515</volume>, <fpage>577</fpage>&#x2013;<lpage>581</lpage>. <pub-id pub-id-type="doi">10.1038/nature13988</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kawashima</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kanehisa</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>AAindex: Amino Acid index Database</article-title>. <source>Nucleic Acids Res.</source> <volume>28</volume>, <fpage>374</fpage>. <pub-id pub-id-type="doi">10.1093/nar/28.1.374</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kidera</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Konishi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Oka</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ooi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Scheraga</surname>
<given-names>H. A.</given-names>
</name>
</person-group> (<year>1985</year>). <article-title>Statistical Analysis of the Physical Properties of the 20 Naturally Occurring Amino Acids</article-title>. <source>J. Protein Chem.</source> <volume>4</volume>, <fpage>23</fpage>&#x2013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.1007/BF01025492</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kirsch</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Vignali</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Robins</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>T-cell Receptor Profiling in Cancer</article-title>. <source>Mol. Oncol.</source> <volume>9</volume>, <fpage>2063</fpage>&#x2013;<lpage>2070</lpage>. <pub-id pub-id-type="doi">10.1016/j.molonc.2015.09.003</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kvistborg</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>van Buuren</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Schumacher</surname>
<given-names>T. N.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Human Cancer Regression Antigens</article-title>. <source>Curr. Opin. Immunol.</source> <volume>25</volume>, <fpage>284</fpage>&#x2013;<lpage>290</lpage>. <pub-id pub-id-type="doi">10.1016/j.coi.2013.03.005</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>TCR-seq Identifies Distinct Repertoires of Distant-Metastatic and Nondistant-Metastatic Thyroid Tumors</article-title>. <source>J. Clin. Endocrinol. Metab.</source> <volume>105</volume> (<issue>9</issue>), <fpage>3036</fpage>&#x2013;<lpage>3045</lpage>. <pub-id pub-id-type="doi">10.1210/clinem/dgaa452</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lefranc</surname>
<given-names>M.-P.</given-names>
</name>
<name>
<surname>Giudicelli</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Duroux</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jabado-Michaloud</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Folch</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Aouinti</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>IMGT, the International ImMunoGeneTics Information System 25 Years on</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume>, <fpage>D413</fpage>&#x2013;<lpage>D422</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gku1056</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Pignon</surname>
<given-names>J.-C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shukla</surname>
<given-names>S. A.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Landscape of Tumor-Infiltrating T Cell Repertoire of Human Cancers</article-title>. <source>Nat. Genet.</source> <volume>48</volume>, <fpage>725</fpage>&#x2013;<lpage>732</lpage>. <pub-id pub-id-type="doi">10.1038/ng.3581</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dou</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J. S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Ultrasensitive Detection of TCR Hypervariable-Region Sequences in Solid-Tissue RNA-Seq Data</article-title>. <source>Nat. Genet.</source> <volume>49</volume>, <fpage>482</fpage>&#x2013;<lpage>483</lpage>. <pub-id pub-id-type="doi">10.1038/ng.3820</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Lung Cancer-Associated T Cell Repertoire as Potential Biomarker for Early Detection of Stage I Lung Cancer</article-title>. <source>Lung Cancer</source> <volume>162</volume>, <fpage>16</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1016/j.lungcan.2021.09.017</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ostmeyer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Christley</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Toby</surname>
<given-names>I. T.</given-names>
</name>
<name>
<surname>Cowell</surname>
<given-names>L. G.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Biophysicochemical Motifs in T-Cell Receptor Sequences Distinguish Repertoires from Tumor-Infiltrating Lymphocyte and Adjacent Healthy Tissue</article-title>. <source>Cancer Res.</source> <volume>79</volume>, <fpage>1671</fpage>&#x2013;<lpage>1680</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.CAN-18-2292</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sahasrabudhe</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sujobert</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zacharaki</surname>
<given-names>E. I.</given-names>
</name>
<name>
<surname>Maurin</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Grange</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jallades</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Deep Multi-Instance Learning Using Multi-Modal Data for Diagnosis of Lymphocytosis</article-title>. <source>IEEE J. Biomed. Health Inform.</source> <volume>25</volume>, <fpage>2125</fpage>&#x2013;<lpage>2136</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2020.3038889</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schreiber</surname>
<given-names>R. D.</given-names>
</name>
<name>
<surname>Old</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Smyth</surname>
<given-names>M. J.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Cancer Immunoediting: Integrating Immunity&#x27;s Roles in Cancer Suppression and Promotion</article-title>. <source>Science</source> <volume>331</volume>, <fpage>1565</fpage>&#x2013;<lpage>1570</lpage>. <pub-id pub-id-type="doi">10.1126/science.1203486</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Best</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Cinelli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Heather</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Reich-Zeliger</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shifrut</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Specificity, Privacy, and Degeneracy in the CD4 T Cell Receptor Repertoire Following Immunization</article-title>. <source>Front. Immunol.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.3389/fimmu.2017.00430</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tomczak</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Czerwi&#x144;ska</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wiznerowicz</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Review the Cancer Genome Atlas (TCGA): an Immeasurable Source of Knowledge</article-title>. <source>Wspolczesna Onkol</source> <volume>1A</volume>, <fpage>68</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.5114/wo.2014.47136</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tran</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Turcotte</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gros</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Robbins</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y.-C.</given-names>
</name>
<name>
<surname>Dudley</surname>
<given-names>M. E.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Cancer Immunotherapy Based on Mutation-specific CD4&#x2b; T Cells in a Patient with Epithelial Cancer</article-title>. <source>Science</source> <volume>344</volume>, <fpage>641</fpage>&#x2013;<lpage>645</lpage>. <pub-id pub-id-type="doi">10.1126/science.1251102</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tumeh</surname>
<given-names>P. C.</given-names>
</name>
<name>
<surname>Harview</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Yearley</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Shintaku</surname>
<given-names>I. P.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>E. J. M.</given-names>
</name>
<name>
<surname>Robert</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>PD-1 Blockade Induces Responses by Inhibiting Adaptive Immune Resistance</article-title>. <source>Nature</source> <volume>515</volume>, <fpage>568</fpage>&#x2013;<lpage>571</lpage>. <pub-id pub-id-type="doi">10.1038/nature13954</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vabalas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gowen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Poliakoff</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Casson</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Machine Learning Algorithm Validation with a Limited Sample Size</article-title>. <source>PLoS One</source> <volume>14</volume> (<issue>11</issue>), <fpage>e0224365</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0224365</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.-H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>T.-Y.</given-names>
</name>
<name>
<surname>Horng</surname>
<given-names>J.-T.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>T.-P.</given-names>
</name>
<name>
<surname>Tseng</surname>
<given-names>Y.-J.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Rapid Detection of Heterogeneous Vancomycin-Intermediate <italic>Staphylococcus aureus</italic> Based on Matrix-Assisted Laser Desorption Ionization Time-Of-Flight: Using a Machine Learning Approach and Unbiased Validation</article-title>. <source>Front. Microbiol.</source> <volume>9</volume>, <fpage>2393</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2018.02393</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Rosasco</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Caponnetto</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>On Early Stopping in Gradient Descent Learning</article-title>. <source>Constr. Approx.</source> <volume>26</volume>, <fpage>289</fpage>&#x2013;<lpage>315</lpage>. <pub-id pub-id-type="doi">10.1007/s00365-006-0663-2</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yokota</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kaminaga</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kobayashi</surname>
<given-names>T. J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Quantification of Inter-sample Differences in T-Cell Receptor Repertoires Using Sequence-Based Information</article-title>. <source>Front. Immunol.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.3389/fimmu.2017.01500</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shukla</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Investigation of Antigen-specific T-Cell Receptor Clusters in Human Cancers</article-title>. <source>Clin. Cancer Res.</source> <volume>26</volume>, <fpage>1359</fpage>&#x2013;<lpage>1371</lpage>. <pub-id pub-id-type="doi">10.1158/1078-0432.CCR-19-3249</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wallace</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A Sensitivity Analysis of (And Practitioners&#x2019; Guide to) Convolutional Neural Networks for Sentence Classification</article-title>. <source>arXiv preprint</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1510.03820</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>