<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1082168</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2023.1082168</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Estimating tissue-specific peptide abundance from public RNA-Seq data</article-title>
<alt-title alt-title-type="left-running-head">Frentzen et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2023.1082168">10.3389/fgene.2023.1082168</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Frentzen</surname>
<given-names>Angela</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2072899/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Greenbaum</surname>
<given-names>Jason A.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1227370/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kim</surname>
<given-names>Haeuk</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2072950/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Peters</surname>
<given-names>Bjoern</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/487574/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Ko&#x15f;alo&#x11f;lu-Yal&#xe7;&#x131;n</surname>
<given-names>Zeynep</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1191697/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Center for Infectious Disease and Vaccine Research</institution>, <institution>La Jolla Institute for Immunology</institution>, <addr-line>San Diego</addr-line>, <addr-line>CA</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Medicine</institution>, <institution>University of California, San Diego</institution>, <addr-line>San Diego</addr-line>, <addr-line>CA</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/777337/overview">Alfonso Maurizio Urso</ext-link>, National Research Council (CNR), Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1161017/overview">Gee Jun Tye</ext-link>, University of Science Malaysia (USM), Malaysia</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/227498/overview">Barbara Schr&#xf6;rs</ext-link>, Translationale Onkologie an der Universit&#xe4;tsmedizin der Johannes Gutenberg-Universit&#xe4;t Mainz, Germany</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Zeynep Ko&#x15f;alo&#x11f;lu-Yal&#xe7;&#x131;n, <email>zeynep@lji.org</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this work</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Computational Genomics, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>01</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1082168</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>10</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>01</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Frentzen, Greenbaum, Kim, Peters and Ko&#x15f;alo&#x11f;lu-Yal&#xe7;&#x131;n.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Frentzen, Greenbaum, Kim, Peters and Ko&#x15f;alo&#x11f;lu-Yal&#xe7;&#x131;n</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Several novel MHC class I epitope prediction tools additionally incorporate the abundance levels of the peptides&#x2019; source antigens and have shown improved performance for predicting immunogenicity. Such tools require the user to input the MHC alleles and peptide sequences of interest, as well as the abundance levels of the peptides&#x2019; source proteins. However, such expression data is often not directly available to users, and retrieving the expression level of a peptide&#x2019;s source antigen from public databases is not trivial. We have developed the Peptide eXpression annotator (pepX), which takes a peptide as input, identifies from which proteins the peptide can be derived, and returns an estimate of the expression level of those source proteins from selected public databases. We have also investigated how the abundance level of a peptide can be best estimated in cases when it can originate from multiple transcripts and proteins and found that summing up transcript-level expression values performs best in distinguishing ligands from decoy peptides.</p>
</abstract>
<kwd-group>
<kwd>RNA-Seq</kwd>
<kwd>RNA sequencing</kwd>
<kwd>peptide (pep)</kwd>
<kwd>prediction</kwd>
<kwd>ligands</kwd>
<kwd>tool</kwd>
<kwd>cancer</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Cancer Institute<named-content content-type="fundref-id">10.13039/100000054</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Division of Intramural Research, National Institute of Allergy and Infectious Diseases<named-content content-type="fundref-id">10.13039/100006492</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Presentation of peptides on the cell surface by major histocompatibility complex (MHC) class I molecules is crucial for CD8<sup>&#x2b;</sup> T cell-mediated immune responses, including those against viral infections and tumors. Receptors on the surface of T cells (TCRs) scan MHC-bound peptides, and peptides that lead to T cell activation and proliferation are referred to as T cell epitopes. Numerous computational tools have been developed to predict which peptides will bind to MHC molecules and likely be recognized as epitopes (<xref ref-type="bibr" rid="B14">Peters et al., 2020</xref>). Several novel MHC class I epitope prediction tools additionally incorporate the abundance levels of the peptides&#x2019; source antigens and have shown improved performance (<xref ref-type="bibr" rid="B1">Abelin et al., 2017</xref>; <xref ref-type="bibr" rid="B17">Sarkizova et al., 2020</xref>; <xref ref-type="bibr" rid="B6">Garcia Alvarez et al., 2022</xref>; <xref ref-type="bibr" rid="B10">Kosaloglu-Yalcin et al., 2022</xref>) for predicting immunogenicity. Such tools require the user to input the MHC alleles and peptide sequences of interest, as well as the abundance levels of the peptides&#x2019; source proteins. However, such expression data is often not directly available to users, and retrieving the expression level of a peptide&#x2019;s source antigen from public databases is not trivial. First, it needs to be determined from which protein(s) the peptide of interest can be derived. Then, the expression values of those proteins need to be fetched from public expression datasets, and data have to be aggregated to account for variability between different individuals and the availability of the same peptide from multiple transcript variants and/or multiple genes.</p>
<p>To address these issues, we have developed the Peptide eXpression annotator (pepX), which takes a peptide as input, identifies from which proteins the peptide can be derived, and returns an estimate of the expression level of those source proteins. In this study, we have also investigated how the abundance level of a peptide can be best estimated in cases when it can originate from multiple source antigens. RNA-Seq gene and transcript expression quantification can be calculated, for example, as FPKM (fragments per kilobase of transcript per million fragments mapped), RPKM (reads per kilobase of exon per million reads mapped), or TPM (transcripts per million). In this study, we chose to use TPM to quantify gene expression. TPM values can be calculated on the transcript level by counting the RNA-Seq reads covering each transcript sequence. TPM values can also be calculated on the gene level by counting RNA-Seq reads covering each transcript encoded by a gene. Here, we provide insights into the differences between using gene-level and transcript-level TPM values for estimating peptide abundances.</p>
<p>We utilize expression data from several public databases, including The Cancer Genome Atlas (TCGA) (<xref ref-type="bibr" rid="B3">Cancer Genome Atlas Research et al., 2013</xref>), Genotype-Tissue Expression (GTEx) (<xref ref-type="bibr" rid="B5">Carithers and Moore, 2015</xref>), Cancer Cell Line Encyclopedia (CCLE) (<xref ref-type="bibr" rid="B7">Ghandi et al., 2019</xref>), and the Human Protein Atlas (HPA) (<xref ref-type="bibr" rid="B21">Uhlen et al., 2010</xref>). pepX is freely available as a web-based resource at <ext-link ext-link-type="uri" xlink:href="http://tools.iedb.org/pepx">http://tools.iedb.org/pepx</ext-link>.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and methods</title>
<sec id="s2-1">
<title>RNA-Seq datasets</title>
<p>The Riaz and Hugo bulk RNA datasets used in this study are available under BioProject accession numbers PRJNA356761 (<xref ref-type="bibr" rid="B16">Riaz et al., 2017</xref>) and PRJNA312948 (<xref ref-type="bibr" rid="B9">Hugo et al., 2016</xref>), respectively. Raw RNA-Seq reads were downloaded and processed using an in-house RNA-Seq mapping and analysis pipeline to calculate gene-level TPM values.</p>
</sec>
<sec id="s2-2">
<title>Expression datasets</title>
<p>Pre-calculated gene-level and transcript-level TPM values for the TCGA Pan-cancer cohort for 33 cancer types were downloaded from the UCSC Xena data pages (<xref ref-type="bibr" rid="B8">Goldman et al., 2020</xref>).</p>
<p>Pre-calculated gene-level and transcript-level TPM values for 256 healthy tissues were downloaded from the Human Protein Atlas (HPA) (<xref ref-type="bibr" rid="B21">Uhlen et al., 2010</xref>).</p>
<p>Pre-calculated gene-level and transcript-level TPM values for 54 healthy tissue subtypes were downloaded from The Genotype-Tissue Expression (GTEx) project data portal (<xref ref-type="bibr" rid="B5">Carithers and Moore, 2015</xref>). Median TPM values were calculated for each of the 31 main tissue types.</p>
<p>Pre-calculated gene-level and transcript-level TPM values for 1,019 cell lines were downloaded from the Cancer Cell Line Encyclopedia (CCLE) (<xref ref-type="bibr" rid="B7">Ghandi et al., 2019</xref>).</p>
<p>All datasets were downloaded in July 2022.</p>
</sec>
<sec id="s2-3">
<title>MHC class I ligand elution datasets</title>
<p>The Trolle dataset consisted of 15,524 non-redundant HLA class I ligands eluted from mono-allelic HeLa cells transfected with five different HLA class I alleles (<xref ref-type="bibr" rid="B20">Trolle et al., 2016</xref>). This dataset was downloaded from the IEDB (<xref ref-type="bibr" rid="B23">Vita et al., 2019</xref>) under the accession number 1000685 (<ext-link ext-link-type="uri" xlink:href="http://www.iedb.org/subID/1000685">http://www.iedb.org/subID/1000685</ext-link>).</p>
<p>The Abelin dataset contained 22,310 non-redundant eluted ligands from mono-allelic B721.221 cells transfected with 16 different HLA class I alleles. The dataset was retrieved from the supplementary materials of the original publication (<xref ref-type="bibr" rid="B1">Abelin et al., 2017</xref>). Abelin et al. also provided matched RNA-Seq data for four replicates under BioProject accession number PRJNA360601. Raw RNA-Seq reads were downloaded and processed using an in-house RNA-Seq mapping and analysis pipeline to calculate gene-level TPM values. Median TPM values of the four replicates were used.</p>
<p>The HLA Ligand Atlas consisted of tissue-specific HLA ligands from 23 healthy tissue types (<xref ref-type="bibr" rid="B13">Marcu et al., 2021</xref>). The dataset contained 223,246 non-redundant peptides and 675,346 peptide tissue pairs. We downloaded the data from the HLA Ligand Atlas data pages (downloaded in September 2022).</p>
<p>The Shinkawa dataset contained 2,352 non-redundant HLA class I eluted ligands from a HCT15/&#x3b2;2 cell line. The dataset was retrieved from the supplementary materials of the original publication (<xref ref-type="bibr" rid="B18">Shinkawa et al., 2021</xref>).</p>
<p>The Pyke dataset contained 34,090 ligands eluted from mono-allelic K562 cell lines transfected with 25 different HLA alleles. The Pyke Cancer dataset contained 31,660 ligands eluted from 12 tissue samples of colorectal and lung cancer patients. Both datasets were retrieved from the <xref ref-type="sec" rid="s10">Supplementary Material S1</xref> of the original publication (<xref ref-type="bibr" rid="B15">Pyke et al., 2021</xref>).</p>
<p>The Sarkizova dataset contained 140,918 eluted ligands from B721.221 cells transfected with 79 different HLA class I alleles. The dataset was retrieved from the supplementary materials of the original publication (<xref ref-type="bibr" rid="B17">Sarkizova et al., 2020</xref>).</p>
</sec>
<sec id="s2-4">
<title>PepX</title>
<p>The backend of pepX is a PostgreSQL database that is populated with all possible 8-15mers from the human proteome linked to TPM data from several publicly available databases.</p>
<p>A partial entity-relation diagram of the core tables in the pepX database is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The proteome tables of the database were populated by retrieving all possible 8-15mers from the GRCh38 Ensembl proteome (release-106). All peptides are linked to their associated protein sequences, as well as gene, transcript, and protein identifiers. The &#x2018;gene&#x2019; table includes the unique Ensembl gene (ENSG) identifiers, their gene symbols, and the number of proteins encoded by the gene. The &#x2018;gene2tx2protein&#x2019; table contains one row per protein/transcript and maps the Ensembl gene identifier to their corresponding protein (ENSP) and transcript (ENST) IDs. This table also includes the full protein sequence. The &#x2018;peptide2protein&#x2019; table maps each 8-15mer to their corresponding Ensembl protein ids, keeping note of the zero-indexed start position of the kmer within the full protein sequence. This schema allows pepX to quickly look up which genes and transcripts are linked to a given peptide.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>pepX Database Schema. The three proteome-related tables of the database (in blue) catalog peptides, proteins, transcripts and genes. The three expression-related tables of the database (in green) track gene- and transcript-level TPM data and associated study details.</p>
</caption>
<graphic xlink:href="fgene-14-1082168-g001.tif"/>
</fig>
<p>We included expression data from six public databases in pepX, namely HPA, GTEx, TCGA, CCLE, and RNA-Seq data of a B721.221 cell line (<xref ref-type="bibr" rid="B1">Abelin et al., 2017</xref>) as well as a HeLa cell line (<xref ref-type="bibr" rid="B4">Cantarella et al., 2019</xref>). Each has several subtypes, which are cancer types in the case of TCGA (e.g. BRCA, COAD, PAAD), tissue-types in the case of HPA and GTEx (e.g. Skin, Stomach, Thyroid Gland), and cell-lines in the case of CCLE (e.g. HELA_CERVIX, HCC56_LARGE_INTESTINE). These subtypes and their external dataset source are listed in the &#x2018;expression_dataset&#x2019; table, each with Boolean values to indicate if gene- and transcript-level data are available (<xref ref-type="fig" rid="F1">Figure 1</xref>). All entries in the &#x2018;expression_dataset&#x2019; table contain a unique &#x2018;dataset_id&#x2019; that is used to map TPM values in the &#x2018;gene_TPM&#x2019; and &#x2018;transcript_TPM&#x2019; tables back to the associated dataset. The &#x2018;gene_TPM&#x2019; table maps TPMs from a given subtype to an Ensembl gene id, while the &#x2018;transcript_TPM&#x2019; table maps TPMs to the Ensembl transcript id. This structure allows pepX to quickly grab all TPMs linked to a given gene or transcript for a given study.</p>
<p>Several views have been created that efficiently perform the joins necessary to provide peptide-level and gene/transcript-level TPM output, given a list of peptides and dataset ID as input. In addition to the raw TPMs, scaled TPMs are calculated for each of the source proteins as:<disp-formula id="equ1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo>&#x23;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x23;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>For each peptide, the total TPM, median TPM, and maximum TPM are also provided.</p>
</sec>
<sec id="s2-5">
<title>Statistical analysis</title>
<p>R/Bioconductor was used for all statistical analyses. The following significance levels were used in all figures: ns: <italic>p</italic> &#x3e; .05, &#x2a;: p&#x2264;.05, &#x2a;&#x2a;: p&#x2264;.01, &#x2a;&#x2a;&#x2a;: p&#x2264;.001, &#x2a;&#x2a;&#x2a;&#x2a;: p&#x2264;.0001. All statistical tests are paired Wilcoxon tests, unless otherwise indicated.</p>
</sec>
<sec id="s2-6">
<title>RNA-Seq mapping pipeline</title>
<p>Reads mapping to tRNA, rRNA, adapter sequences, and spike-in controls were filtered with Bowtie 2 (v2.1.0). Remaining reads were mapped to the GRCh38 reference genome with Gencode v27 annotations using STAR (v2.6.1). Low complexity reads (DUST &#x3e;4) were removed from the BAMs with PRINSEQ Lite (v0.20.3) before counting reads with featureCounts (v1.6.5).</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Gene expression data from public databases correlate well with individual patient-derived RNA-Seq data</title>
<p>The TCGA program sequenced thousands of tumor and matched normal samples spanning 33 cancer types. For each cancer type, the number of patients analyzed varies, from 1,211 patients with breast cancer (TCGA-BRCA) to 45 patients with Cholangiocarcinoma (TCGA-CHOL, <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>). This means when looking for the expression of a gene of interest in a specific cancer type, there are several expression values from different patients which need to be aggregated and summarized to derive one expression value per gene.</p>
<p>We sought to investigate what the best method is to aggregate TPM values from different patients and to determine how well the RNA-Seq data from TCGA correlates with patient-matched RNA-Seq. We obtained patient-matched RNA-Seq data from two melanoma studies published by Hugo et al. (<xref ref-type="bibr" rid="B9">Hugo et al., 2016</xref>) and Riaz et al. (<xref ref-type="bibr" rid="B16">Riaz et al., 2017</xref>) and compared the TPM values to the TCGA skin cancer samples (TCGA-SKCM). We aggregated the data over the 470 TCGA-SKCM patients by calculating the mean, median, and geometric mean TPM for each gene. Given the statistical background of how these metrics are calculated, it is expected that the values can significantly vary. The TPM values for PD-1 (PDCD1), for example, range between 0 and 60 in the TCGA-SKCM cohort (<xref ref-type="fig" rid="F2">Figure 2A</xref>). The mean is 4.5, the median is 1.6, and the geometric mean is 1.3. For each patient in the Hugo and Riaz datasets, we considered all genes and calculated how well the TPMs correlate to the TCGA-SKCM mean, median, and geometric mean. We found that the values of the three metrics significantly vary from each other (Kruskal-Wallis test, <italic>p</italic> &#x3c; .0001) and that the median TCGA-SKCM correlates best with patient-specific TPM values in both datasets (Wilcoxon test, <italic>p</italic> &#x3c; .001, <xref ref-type="fig" rid="F2">Figure 2B</xref>).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Expression data from TCGA correlates well with patient-specific RNA-Seq data. <bold>(A)</bold>. The mean, median, and geomean TPM were calculated for the gene PDCD1 for the 470 TCGA-SKCM patients (each dot represents one patient). <bold>(B)</bold>. For each patient in the Hugo (<xref ref-type="bibr" rid="B9">Hugo et al., 2016</xref>) and Riaz (<xref ref-type="bibr" rid="B16">Riaz et al., 2017</xref>) datasets, all genes were considered and TPM values were correlated to the TCGA-SKCM mean, median, and geomean TPM. Spearman correlation coefficients were calculated. <bold>(C)</bold>. For each patient, the TPM values were separated into ranges for both the patient-specific (<italic>x</italic>-axis) and the TCGA median (<italic>y</italic>-axis) TPM values. For each TPM range combination, the fraction of genes expressed within the corresponding TPM ranges is shown as a percentage and is also color-coded.</p>
</caption>
<graphic xlink:href="fgene-14-1082168-g002.tif"/>
</fig>
<p>To determine the overall correlation between the patient-specific TPM values and the TCGA-SKCM median TPM values, we combined the Hugo and Riaz datasets and found a significant correlation (Spearman correlation coefficient r &#x3d; .82). To get a better overview of the correlation, we separated the TPM values for each patient into ranges for both the patient-specific and the TCGA median TPM values to generate a 2-dimensional matrix. We then analyzed each TPM range combination and calculated the fraction of genes expressed within the corresponding TPM ranges (<xref ref-type="fig" rid="F2">Figure 2C</xref>). We found that 67% of genes are expressed at low levels in both the Hugo and Riaz cohort and in the TCGA-SKCM cohort, with TPM values of &#x3c;1. The majority of genes are expressed at similar levels in both cohorts, as demonstrated by an enrichment of genes on the diagonal in <xref ref-type="fig" rid="F2">Figure 2C</xref>. We observed similar results when we analyzed a smaller set of in-house patients with six different cancer types (<xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>).</p>
<p>Taken together, these findings show that TPM values from a public database like TCGA are suitable for estimating gene expression in a patient sample if patient-specific RNA-Seq is not available.</p>
</sec>
<sec id="s3-2">
<title>Retrieving peptide abundance from public databases and aggregating expression levels from different source antigens</title>
<p>We developed pepX, a tool for estimating a peptide&#x2019;s expression level based upon the source antigen(s) in which it is contained. pepX takes a list of peptides and a public dataset identifier as input and returns the expression level of each protein the peptide was found in. pepX also provides aggregated expression levels for peptides that can be retrieved from multiple transcripts and proteins. The expression levels can be retrieved from a number of public databases, including The Cancer Genome Atlas (TCGA), CCLE (The Cancer Cell Line Encyclopedia) (<xref ref-type="bibr" rid="B7">Ghandi et al., 2019</xref>), HPA (The Human Protein Atlas) (<xref ref-type="bibr" rid="B21">Uhlen et al., 2010</xref>), and GTEx (The Genotype-Tissue Expression Project) (<xref ref-type="bibr" rid="B5">Carithers and Moore, 2015</xref>). These datasets provide expression values on the gene level as well as the transcript level. We used pepX to investigate different ways of estimating peptide abundance and the differences in using gene-level and transcript-level TPM values.</p>
<p>As illustrated in <xref ref-type="fig" rid="F3">Figure 3A</xref>, it is possible that the exact same peptide can be found in different proteins encoded by different genes (e.g., Peptide A in <xref ref-type="fig" rid="F3">Figure 3A</xref> can be retrieved from two proteins of Gene A and from one protein of Gene B). To analyze the extent of this, we considered the set of unique peptides in the HLA Ligand Atlas (n &#x3d; 223,246) (<xref ref-type="bibr" rid="B13">Marcu et al., 2021</xref>) and investigated the number of possible source proteins for each peptide. We found that 88% of peptides can be retrieved from protein sequences corresponding to exactly one Ensembl gene id. We investigated the remaining 12% of peptides that could be retrieved from different gene ids and found that, for the majority of cases (96%), the corresponding genes belonged to the same gene family. As gene families are formed by duplication of a single original gene, genes that are categorized into families usually share nucleotide and protein sequences. It is thus not surprising that a peptide can occur in multiple proteins that are encoded by genes that are part of a gene family. It is, however, not clear how the abundance of such peptides should be measured, as there are several options: 1) using the median TPM of all genes, 2) using the maximum TPM among all genes, or 3) summing up the TPM values of all genes.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Considerations for retrieving peptide abundance levels. Due to alternative splicing, genes can produce multiple different proteins. <bold>(A)</bold> The different protein sequences usually share amino acid stretches encoded by the same exons. It is also possible that different genes share amino acid stretches, particularly genes from the same gene family. Peptide A (highlighted in red) for example can be retrieved from two proteins of Gene A and from one protein of Gene B, while Peptide B (highlighted in blue) can be retrieved from three proteins of Gene A. <bold>(B)</bold>. Performance comparison of different ways to aggregate TPM values of multiple source proteins in distinguishing ligands of the HLA Ligand Atlas from decoy peptides. Summing up TPM values (total TPM) from all genes a peptide can be retrieved from performs best, followed by using the maximum TPM of all genes (Wilcoxon Test, p&#x2264;.0001). <bold>(C)</bold>. Performance comparison of scaling the TPM values considering the number of proteins a gene encodes and the number of proteins a peptide occurs in for ligands from the HLA Ligand Atlas. The total TPM significantly outperformed the total scaled TPM values (Wilcoxon Test, p&#xa0;&#x2264;&#xa0;.0001). <bold>(D)</bold>. Ligand elution datasets used in this study and the expression datasets we used to retrieve abundance levels. <bold>(E)</bold>. Performance comparison of different ways to aggregate TPM values of multiple source proteins in distinguishing ligands of the six validation datasets from decoy peptides. <bold>(F)</bold>. Performance comparison of total TPM and total scaled TPM proteins in distinguishing ligands of the six validation datasets from decoy peptides.</p>
</caption>
<graphic xlink:href="fgene-14-1082168-g003.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>Summing up expression levels from different source antigens provides the most accurate estimation of peptide abundance</title>
<p>We used peptides from the HLA Ligand Atlas for validation and analysis of pepX performance. The HLA Ligand Atlas contains tissue-specific HLA ligands from 23 healthy tissue types (<xref ref-type="bibr" rid="B13">Marcu et al., 2021</xref>). The dataset consisted of 223,246 non-redundant peptides and 675,346 peptide tissue pairs. We wanted to investigate how the abundance level of these peptides could be best estimated using gene-level TPM values. We generated a set of decoy peptides by randomly selecting length-matched peptides from the human proteome and assessed the performance of different metrics in distinguishing true ligands from the set of random decoy peptides. The Human Protein Atlas was used as the expression dataset by matching the tissue types represented in the HLA Ligand Atlas (<xref ref-type="sec" rid="s10">Supplementary Table S1</xref>).</p>
<p>We evaluated all three options for aggregating expression values across different source antigens: 1) using the median TPM of all genes, 2) using the maximum TPM among all genes, or 3) summing up the TPM values of all genes. We used pepX to identify the proteins from which each peptide could be derived and retrieved expression levels of those source proteins. We evaluated the performance of each TPM aggregation method in distinguishing peptides from the HLA Ligand Atlas from the set of decoy peptides. The Area under the Receiver Operating Characteristics (ROC) Curve (AUC) was used to measure performance. We found that the values of the three metrics differ significantly from each other (Kruskal-Wallis test, <italic>p</italic> &#x3d; .01034) and that with a mean AUC of .805, summing up TPM values (total TPM) performs best, followed by using the maximum TPM among all genes (mean AUC &#x3d; .803) and using the median TPM of all genes (mean AUC &#x3d; .785, Wilcoxon test, <italic>p</italic> &#x3c; .0001, <xref ref-type="fig" rid="F3">Figure 3B</xref>).</p>
<p>Another detail we wanted to investigate was that each gene can be transcribed into different transcripts and thus translated into different proteins. The different transcripts correspond to different splice variants that are found in different tissue types, developmental stages, <italic>etc.</italic> However, the expression values of the different transcripts of a gene are collapsed into a single expression value when generating gene-level expression data. We hypothesized that, when using such gene-level TPM data, it might be important to consider in how many of a gene&#x2019;s transcripts the peptide occurs (e.g., Peptide B in <xref ref-type="fig" rid="F3">Figure 3A</xref> occurs in three different proteins from Gene A, while Peptide C occurs only in one). We developed a &#x2018;scaled TPM&#x2019;, which considers the number of proteins in which the peptide is found and the total number of proteins for the gene (detailed in the Methods section). However, with a mean AUC of .802, this scaled TPM did not improve the performance in predicting peptides from the HLA Ligand Atlas (<xref ref-type="fig" rid="F3">Figure 3C</xref>).</p>
<p>To validate these findings, we gathered additional ligand elution datasets and matched them to their corresponding expression datasets (<xref ref-type="fig" rid="F3">Figure 3D</xref>). Again, the total gene TPM significantly outperformed the median gene TPM (Wilcoxon Test, <italic>p</italic> &#x3c; .05, <xref ref-type="fig" rid="F3">Figure 3E</xref>). The total gene TPM was also slightly higher than the maximum TPM in these datasets; the difference was, however, not significant. Also, the values of the three metrics were distributed in a similar way (Kruskal-Wallis test, <italic>p</italic> &#x3d; .4637). Scaling the TPM values considering the number of proteins in which the peptide is found again did not improve performance (Wilcoxon Test, <italic>p</italic> &#x3e; .05, <xref ref-type="fig" rid="F3">Figure 3F</xref>).</p>
<p>It is also possible that a peptide occurs multiple times in a single protein sequence, e.g., in the case of repeating amino acid sequences (<xref ref-type="bibr" rid="B12">Luo and Nijveen, 2014</xref>). In our calculations above, such peptides were only counted once and we did not see an increase in performance when we considered duplicate peptides in a protein (<xref ref-type="sec" rid="s10">Supplementary Figure S3</xref>). This is likely due to the fact that 99.6% of the peptides in the human proteome do not occur multiple times in the same protein.</p>
<p>Taken together, we used pepX successfully to retrieve abundance levels of peptides&#x2019; source proteins and showed that in cases where peptides can be retrieved from multiple proteins, summing up the TPM values of the encoding genes performs best in distinguishing ligands from decoy peptides.</p>
</sec>
<sec id="s3-4">
<title>Transcript-level TPM data provides a more accurate estimation of peptide abundance than gene-level TPM data</title>
<p>We next investigated how well peptide abundance can be estimated using transcript-level instead of gene-level TPM values. We again used pepX to retrieve transcript-level TPM values from HPA for the peptides from the HLA Ligand Atlas and the set of random decoy peptides. We evaluated the performance of 1) using the median TPM of all transcripts, 2) using the maximum TPM among all transcripts, and 3) summing up TPM values of all transcripts. Similar to what we have observed when using gene-level TPM, summing up the TPM values of all transcripts (total TPM) significantly outperformed the median and the maximum TPM in the HLA Ligand Atlas dataset (Kruskal-Wallis test, <italic>p</italic> &#x3c; .0001, Wilcoxon Test, <italic>p</italic> &#x2264; .0001, <xref ref-type="fig" rid="F4">Figure 4A</xref>). In the validation datasets, the total TPM also significantly outperformed the median TPM (Kruskal-Wallis test, <italic>p</italic> &#x3c; .05, Wilcoxon Test, <italic>p</italic> &#x3c; .05, <xref ref-type="fig" rid="F4">Figure 4B</xref>). With a mean AUC of .816 the total TPM performed slightly better than the maximum TPM with a mean AUC of .814; the difference was, however, not significant (Wilcoxon Test, <italic>p</italic> &#x3e; .05).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>
<bold>(A)</bold>. Performance comparison of different ways to aggregate transcript-level TPM values of multiple source proteins in distinguishing ligands of the HLA Ligand Atlas from decoy peptides. Summing up TPM values (total TPM) from all transcripts a peptide can be retrieved from performs best, followed by using the maximum TPM of all transcripts (Wilcoxon Test, p&#x2264;.0001). <bold>(B)</bold>. Performance comparison of different ways to aggregate transcript-level TPM values of multiple source proteins in distinguishing ligands of the four validation datasets from decoy peptides. <bold>(C)</bold>. Performance comparison of using transcript-level and gene-level TPM values in distinguishing ligands of the HLA Ligand Atlas from decoy peptides. The total transcript-level TPM significantly outperformed the total gene-level TPM (Wilcoxon Test, p&#x2264;.0001). <bold>(D)</bold>. Performance comparison of using transcript-level and gene-level TPM values in distinguishing ligands of the four validation datasets from decoy peptides. The mean AUC of the total transcript-level TPM is higher than that of the total gene-level TPM, however not significantly (Wilcoxon Test, <italic>p</italic> &#x3e; .05).</p>
</caption>
<graphic xlink:href="fgene-14-1082168-g004.tif"/>
</fig>
<p>Comparing the total gene TPM and the total transcript TPM for the peptides from HLA Ligand Atlas and the set of random decoy peptides showed that transcript-level TPM values perform significantly better than gene-level TPM values (Wilcoxon Test, p&#x2264;.001, <xref ref-type="fig" rid="F4">Figure 4C</xref>). On the validation datasets, with a mean AUC of .816, transcript-level total TPM performed better than the gene-level total TPM with an AUC of .811, however not significantly (Wilcoxon Test, <italic>p</italic> &#x3e; .05, <xref ref-type="fig" rid="F4">Figure 4D</xref>).</p>
<p>The Genotype-Tissue Expression Project (GTEx) is another database that provides tissue-specific gene expression data from healthy tissue samples. We wanted to compare the performance of using GTEx and HPA transcript-level expression data for estimating the abundance of the peptides from the HLA Ligand Atlas. We focused on the 18 tissue types that we could clearly match between the three datasets (<xref ref-type="sec" rid="s10">Supplementary Table S1</xref>). For this subset of peptides, using TPM values from HPA significantly outperformed using TPM values from GTEx (AUC of .812 vs. .805, Wilcoxon Test, <italic>p</italic> &#x3c; .01, <xref ref-type="sec" rid="s10">Supplementary Figure S4</xref>).</p>
<p>All expression databases provide transcript-level TPM values calculated with RSEM (<xref ref-type="bibr" rid="B11">Li and Dewey, 2011</xref>). The TCGA also provided TPM values calculated with Kallisto (<xref ref-type="bibr" rid="B2">Bray et al., 2016</xref>). We compared the performance of the two metrics using the Pyke Cancer dataset. The two metrics performed very similarly: the TPM calculated using RSEM had an AUC of .787 and the one calculated using Kallisto had an AUC of .786.</p>
<p>Taken together, we have shown here that, if available, transcript-level TPM data should be used to estimate peptide abundance, regardless of whether RSEM or Kallisto was used to calculate the TPM values. In the case of expression data of healthy tissue, HPA seems to be slightly more accurate for estimating peptide abundance of ligands eluted from healthy tissue samples.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>The abundance of a peptide&#x2019;s source antigen can be an important factor in predicting the likelihood that the peptide is a ligand and an epitope that is recognized by T cells, as it has previously been demonstrated that high peptide abundance can compensate for poor binding affinity (<xref ref-type="bibr" rid="B10">Kosaloglu-Yalcin et al., 2022</xref>). Although several novel prediction tools now integrate the expression levels of source antigens, there were, to our knowledge, no web tools available that can help users in retrieving such data from numerous public databases. We developed pepX to fill this gap. We also formally analyzed the different ways expression data can be retrieved and aggregated to most accurately estimate peptide abundance. We successfully used pepX to estimate peptide abundance for ligands from several datasets of eluted ligands and showed that summing up the transcript-level TPM values of different possible source proteins provides the most accurate estimation.</p>
<p>Of the 130,949 peptides of length 8&#x2013;14 in the HLA Ligand Atlas, 5,332 peptides could not be matched using pepX. This might be due to differences between the reference proteome sources, versions, and filters applied in each project. For instance, the HLA Ligand Atlas relied on the SwissProt human reference proteome while pepX makes use of the Ensembl reference. We are currently exploring ways to further expand the universe of peptides that can be quantified using pepX. To enable the quantification of MHC class II presented peptides, we have included 15mer peptides in pepX and are planning to include peptide lengths of up to 25 in the next version of pepX.</p>
<p>pepX currently outputs Ensembl identifiers (gene, transcript, and protein ids) and HGNC gene symbols. These identifiers can be mapped with an external id mapping tool, such as provided by UniProt (<xref ref-type="bibr" rid="B22">UniProt Consortium, 2019</xref>) or Biomart (<xref ref-type="bibr" rid="B19">Smedley et al., 2009</xref>). We included six gene expression datasets in pepX, namely HPA, GTEx, TCGA, CCLE, and RNA-Seq data of a B721.221 cell line. We anticipate adding more datasets, including expression values from mouse samples. We also plan to provide the option to upload custom TPM tables, e.g., from patient RNA-Seq, which can be used to retrieve peptide abundance estimates.</p>
<p>Including peptide abundance was shown to improve accuracy when predicting naturally eluted ligands, cancer epitopes, and epitopes from infectious diseases such as SARS-CoV-2 (<xref ref-type="bibr" rid="B17">Sarkizova et al., 2020</xref>; <xref ref-type="bibr" rid="B6">Garcia Alvarez et al., 2022</xref>; <xref ref-type="bibr" rid="B10">Kosaloglu-Yalcin et al., 2022</xref>). pepX can be used in combination with epitope prediction tools that consider peptide abundance, such as HLAthena (<xref ref-type="bibr" rid="B17">Sarkizova et al., 2020</xref>), AXEL-F (<xref ref-type="bibr" rid="B10">Kosaloglu-Yalcin et al., 2022</xref>), and NetMHCpanExp (<xref ref-type="bibr" rid="B6">Garcia Alvarez et al., 2022</xref>): the user would first use pepX to retrieve peptide abundance values and use those results as an input for their preferred epitope prediction tool. In a future release of the IEDB Analysis Resource, we also plan to add the option to pipe pepX results directly to epitope prediction tools. We are also working on allowing users to upload custom TPM values to be used when annotating the uploaded peptides.</p>
<p>As pepX is built from the human reference proteome, it is currently not possible to search for mutated peptides, e.g., neoantigens. In a future version of pepX we are planning to provide the option to search for mutated peptides as well by incorporating an initial scan with PEPMatch (manuscript under review). PEPMatch is a sequence comparison tool we developed that searches a given proteome for exact peptide matches, matches with a defined tolerance for mismatching residues, and best matches. PEPMatch uses a k-mer mapping algorithm, which preprocesses proteomes prior to searching, and achieves a 50-fold increase in speed over algorithms, such as BLAST, while also guaranteeing accurate results. Combined with the option to upload patient-specific RNA-Seq data, this will make pepX highly valuable in selecting and prioritizing neoantigens for immunotherapeutic approaches.</p>
<p>pepX is freely available at <ext-link ext-link-type="uri" xlink:href="http://tools.iedb.org/pepx">http://tools.iedb.org/pepx</ext-link> and will be periodically updated to include additional features that provide more utility.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/<xref ref-type="sec" rid="s10">Supplementary Material</xref>.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>Study concept and design, ZK-Y; Acquisition of Data, ZK-Y; Implementation, AF and JG; Data Analysis and Interpretation, ZK-Y, BP, and JG; Web app implementation, HK; Writing of the Manuscript, ZK-Y and AF; All authors reviewed and edited the manuscript.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>Research reported in this publication was supported by the National Cancer Institute of the National Institutes of Health under award number U24CA248138 and by the National Institute of Allergy and Infectious Diseases (NIAID) under award number 75N93019C00001.</p>
</sec>
<ack>
<p>We thank Drs Alessandro Sette and Morten Nielsen for helpful discussions.</p>
</ack>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2023.1082168/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2023.1082168/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abelin</surname>
<given-names>J. G.</given-names>
</name>
<name>
<surname>Keskin</surname>
<given-names>D. B.</given-names>
</name>
<name>
<surname>Sarkizova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hartigan</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Sidney</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Mass spectrometry profiling of HLA-associated peptidomes in mono-allelic cells enables more accurate epitope prediction</article-title>. <source>Immunity</source> <volume>46</volume> (<issue>2</issue>), <fpage>315</fpage>&#x2013;<lpage>326</lpage>. <pub-id pub-id-type="doi">10.1016/j.immuni.2017.02.007</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bray</surname>
<given-names>N. L.</given-names>
</name>
<name>
<surname>Pimentel</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Melsted</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pachter</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Near-optimal probabilistic RNA-seq quantification</article-title>. <source>Nat. Biotechnol.</source> <volume>34</volume> (<issue>5</issue>), <fpage>525</fpage>&#x2013;<lpage>527</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.3519</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<collab>Cancer Genome Atlas Research Network</collab>
<name>
<surname>Weinstein</surname>
<given-names>J. N.</given-names>
</name>
<name>
<surname>Collisson</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Mills</surname>
<given-names>G. B.</given-names>
</name>
<name>
<surname>Shaw</surname>
<given-names>K. R.</given-names>
</name>
<name>
<surname>Ozenberger</surname>
<given-names>B. A.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>The cancer genome Atlas pan-cancer analysis project</article-title>. <source>Nat. Genet.</source> <volume>45</volume> (<issue>10</issue>), <fpage>1113</fpage>&#x2013;<lpage>1120</lpage>. <pub-id pub-id-type="doi">10.1038/ng.2764</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cantarella</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Carnevali</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Morselli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Conti</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pellegrini</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Montanini</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Alu RNA modulates the expression of cell cycle genes in human fibroblasts</article-title>. <source>Int. J. Mol. Sci.</source> <volume>20</volume> (<issue>13</issue>), <fpage>3315</fpage>. <pub-id pub-id-type="doi">10.3390/ijms20133315</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carithers</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>H. M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>The genotype-tissue expression (GTEx) project</article-title>. <source>Biopreserv Biobank</source> <volume>13</volume> (<issue>5</issue>), <fpage>307</fpage>&#x2013;<lpage>308</lpage>. <pub-id pub-id-type="doi">10.1089/bio.2015.29031.hmm</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Garcia Alvarez</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Kosaloglu-Yalcin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Nielsen</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>The role of antigen expression in shaping the repertoire of HLA presented ligands</article-title>. <source>iScience</source> <volume>25</volume> (<issue>9</issue>), <fpage>104975</fpage>. <pub-id pub-id-type="doi">10.1016/j.isci.2022.104975</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ghandi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>F. W.</given-names>
</name>
<name>
<surname>Jane-Valbuena</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kryukov</surname>
<given-names>G. V.</given-names>
</name>
<name>
<surname>Lo</surname>
<given-names>C. C.</given-names>
</name>
<name>
<surname>McDonald</surname>
<given-names>E. R.</given-names>
<suffix>3rd</suffix>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Next-generation characterization of the cancer cell line Encyclopedia</article-title>. <source>Nature</source> <volume>569</volume> (<issue>7757</issue>), <fpage>503</fpage>&#x2013;<lpage>508</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1186-3</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goldman</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Craft</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hastie</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Repecka</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>McDade</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Kamath</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Visualizing and interpreting cancer genomics data via the Xena platform</article-title>. <source>Nat. Biotechnol.</source> <volume>38</volume> (<issue>6</issue>), <fpage>675</fpage>&#x2013;<lpage>678</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-020-0546-8</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hugo</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zaretsky</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Moreno</surname>
<given-names>B. H.</given-names>
</name>
<name>
<surname>Hu-Lieskovan</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Genomic and transcriptomic features of response to anti-PD-1 therapy in metastatic melanoma</article-title>. <source>Cell</source> <volume>165</volume> (<issue>1</issue>), <fpage>35</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2016.02.065</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kosaloglu-Yalcin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Greenbaum</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schoenberger</surname>
<given-names>S. P.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y. J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Combined assessment of MHC binding and antigen abundance improves T cell epitope predictions</article-title>. <source>iScience</source> <volume>25</volume> (<issue>2</issue>), <fpage>103850</fpage>. <pub-id pub-id-type="doi">10.1016/j.isci.2022.103850</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dewey</surname>
<given-names>C. N.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>RSEM: Accurate transcript quantification from RNA-seq data with or without a reference genome</article-title>. <source>BMC Bioinforma.</source> <volume>12</volume>, <fpage>323</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-12-323</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Nijveen</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Understanding and identifying amino acid repeats</article-title>. <source>Brief. Bioinform.</source> <volume>15</volume> (<issue>4</issue>), <fpage>582</fpage>&#x2013;<lpage>591</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbt003</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marcu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bichmann</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kuchenbecker</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kowalewski</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Freudenmann</surname>
<given-names>L. K.</given-names>
</name>
<name>
<surname>Backert</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>HLA ligand Atlas: A benign reference of HLA-presented peptides to improve T-cell-based cancer immunotherapy</article-title>. <source>J. Immunother. Cancer</source> <volume>9</volume> (<issue>4</issue>), <fpage>e002071</fpage>. <pub-id pub-id-type="doi">10.1136/jitc-2020-002071</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peters</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Nielsen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sette</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>T cell epitope predictions</article-title>. <source>Annu. Rev. Immunol.</source> <volume>38</volume>, <fpage>123</fpage>&#x2013;<lpage>145</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-immunol-082119-124838</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pyke</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Mellacheruvu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Dea</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Abbott</surname>
<given-names>C. W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S. V.</given-names>
</name>
<name>
<surname>Phillips</surname>
<given-names>N. A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Precision neoantigen discovery using large-scale immunopeptidomes and composite modeling of MHC peptide presentation</article-title>. <source>Mol. Cell Proteomics</source> <volume>20</volume>, <fpage>100111</fpage>. <pub-id pub-id-type="doi">10.1016/j.mcpro.2021.100111</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riaz</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Havel</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Makarov</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Desrichard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Urba</surname>
<given-names>W. J.</given-names>
</name>
<name>
<surname>Sims</surname>
<given-names>J. S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Tumor and microenvironment evolution during immunotherapy with nivolumab</article-title>. <source>Cell</source> <volume>171</volume> (<issue>4</issue>), <fpage>934</fpage>&#x2013;<lpage>949</lpage>. <comment>e16</comment>. <pub-id pub-id-type="doi">10.1016/j.cell.2017.09.028</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sarkizova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Klaeger</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>P. M.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L. W.</given-names>
</name>
<name>
<surname>Oliveira</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Keshishian</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>A large peptidome dataset improves HLA class I epitope prediction across most of the human population</article-title>. <source>Nat. Biotechnol.</source> <volume>38</volume> (<issue>2</issue>), <fpage>199</fpage>&#x2013;<lpage>209</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-019-0322-9</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shinkawa</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tokita</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nakatsugawa</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kikuchi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kanaseki</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Torigoe</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Characterization of CD8(&#x2b;) T-cell responses to non-anchor-type HLA class I neoantigens with single amino-acid substitutions</article-title>. <source>Oncoimmunology</source> <volume>10</volume> (<issue>1</issue>), <fpage>1870062</fpage>. <pub-id pub-id-type="doi">10.1080/2162402X.2020.1870062</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smedley</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Haider</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ballester</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Holland</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>London</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Thorisson</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2009</year>). <article-title>BioMart-biological queries made easy</article-title>. <source>BMC Genomics</source> <volume>10</volume>, <fpage>22</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2164-10-22</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trolle</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>McMurtrey</surname>
<given-names>C. P.</given-names>
</name>
<name>
<surname>Sidney</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bardet</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Osborn</surname>
<given-names>S. C.</given-names>
</name>
<name>
<surname>Kaever</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>The length distribution of class I-restricted T cell epitopes is determined by both peptide supply and MHC allele-specific binding preference</article-title>. <source>J. Immunol.</source> <volume>196</volume> (<issue>4</issue>), <fpage>1480</fpage>&#x2013;<lpage>1487</lpage>. <pub-id pub-id-type="doi">10.4049/jimmunol.1501721</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Uhlen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Oksvold</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fagerberg</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lundberg</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Jonasson</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Forsberg</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>Towards a knowledge-based Human Protein Atlas</article-title>. <source>Nat. Biotechnol.</source> <volume>28</volume> (<issue>12</issue>), <fpage>1248</fpage>&#x2013;<lpage>1250</lpage>. <pub-id pub-id-type="doi">10.1038/nbt1210-1248</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<collab>UniProt Consortium</collab> (<year>2019</year>). <article-title>UniProt: A worldwide hub of protein knowledge</article-title>. <source>Nucleic Acids Res.</source> <volume>47</volume> (<issue>D1</issue>), <fpage>D506</fpage>&#x2013;<lpage>D515</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gky1049</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vita</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Mahajan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Overton</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Dhanda</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Martini</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cantrell</surname>
<given-names>J. R.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>The Immune Epitope Database (IEDB): 2018 update</article-title>. <source>Nucleic Acids Res.</source> <volume>47</volume> (<issue>D1</issue>), <fpage>D339</fpage>&#x2013;<lpage>D343</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gky1006</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>