<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="methods-article" dtd-version="1.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1630161</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2025.1630161</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Methods</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Quantitative measures to assess the quality of cellular indexing of transcriptomes and epitopes by sequencing data</article-title>
<alt-title alt-title-type="left-running-head">Sun et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2025.1630161">10.3389/fbinf.2025.1630161</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Sun</surname>
<given-names>Jie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x26; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Morrison</surname>
<given-names>Robert</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x26; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kim</surname>
<given-names>Soyeon</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/888663"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yan</surname>
<given-names>Kairuo</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3104223"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x26; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Park</surname>
<given-names>Hyun Jung</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/834895"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x26; editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Department of Human Genetics, School of Public Health, University of Pittsburgh</institution>, <city>Pittsburgh</city>, <addr-line>PA</addr-line>, <country country="US">United States</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Department of Medicine and Division of Hematology/Oncology, University of Pittsburgh, School of Medicine</institution>, <city>Pittsburgh</city>, <addr-line>PA</addr-line>, <country country="US">United States</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>Department of Immunology, University of Pittsburgh, School of Medicine</institution>, <city>Pittsburgh</city>, <addr-line>PA</addr-line>, <country country="US">United States</country>
</aff>
<aff id="aff4">
<label>4</label>
<institution>Department of Computational and Systems Biology, University of Pittsburgh Medical Center</institution>, <city>Pittsburgh</city>, <addr-line>PA</addr-line>, <country country="US">United States</country>
</aff>
<aff id="aff5">
<label>5</label>
<institution>Division of Pulmonary Medicine, Department of Pediatrics, UPMC Children&#x2019;s Hospital of Pittsburgh, University of Pittsburgh</institution>, <city>Pittsburgh</city>, <addr-line>PA</addr-line>, <country country="US">United States</country>
</aff>
<aff id="aff6">
<label>6</label>
<institution>Department of Computer Science, Northeastern University</institution>, <city>Boston</city>, <addr-line>MA</addr-line>, <country country="US">United States</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Hyun Jung Park, <email>hyp15@pitt.edu</email>
</corresp>
<fn fn-type="equal" id="fn001">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-09-18">
<day>18</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>5</volume>
<elocation-id>1630161</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>07</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Sun, Morrison, Kim, Yan and Park.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Sun, Morrison, Kim, Yan and Park</copyright-holder>
<license>
<ali:license_ref start_date="2025-09-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Cellular indexing of transcriptomes and epitopes by sequencing (CITE-Seq) is a powerful technique to simultaneously measure gene expression and cell surface protein abundances in individual cells. To obtain accurate and reliable biological findings from CITE-Seq data, it is critical to ensure rigorous quality control (QC). However, no public method has yet been developed for CITE-Seq QC.</p>
</sec>
<sec>
<title>Results</title>
<p>In this study, we propose the first software package for multi-layered, systematic, and quantitative quality control (CITESeQC). Recognizing the multi-layered nature of CITE-Seq data, CITESeQC performs QC across gene expressions, surface proteins, and their interactions. It systematically evaluates all genes and protein markers assayed in the data and flags low-quality ones based on individual quality measures. Furthermore, for quantitative QC that enables objective and standardized analyses, CITESeQC quantifies cell type-specific expression of genes and surface proteins using Shannon entropy and correlation-based measures. Finally, to ensure broad applicability, CITESeQC guides users through a simple process that generates a complete markdown report with supporting figures and explanations, requiring minimal user intervention.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>By quantifying the quality of CITE-Seq data, CITESeQC enables precise characterization of gene expression within cell types and reliable classification of cell types using surface protein markers, thereby enhancing its value for clinical applications.</p>
</sec>
</abstract>
<kwd-group>
<kwd>CITE-Seq</kwd>
<kwd>quality control (QC)</kwd>
<kwd>multi-omics integration</kwd>
<kwd>biomarker discovery</kwd>
<kwd>computational software</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by the UPMC Hillman Cancer Center Biostatistics Shared Resource, which is supported in part by awards P30CA047904 and R01GM108618 from the NIH. This work was also supported by the Hillman Cancer Center Career Enhancement Program Award (P50 CA254865-01).</funding-statement>
</funding-group>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Single Cell Bioinformatics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Background</title>
<p>While traditional single-cell RNA-seq techniques assay only gene expression by capturing and sequencing RNA molecules, cellular indexing of transcriptomes and epitopes by sequencing (CITE-Seq) assays both RNA molecules and surface proteins of interest simultaneously by utilizing unique DNA-barcoded antibodies, also known as &#x201c;antibody-derived tags (ADTs).&#x201d; Since cell surface proteins serve as markers and communicators of a cell&#x2019;s identity and function, CITE-Seq data enable the identification not only of cell-type specific gene expression patterns but also of cell types defined by specific surface proteins that may be used for further clinical applications. For example, although some immune cell types, such as &#x3b3;/&#x3b4; T cells (<xref ref-type="bibr" rid="B18">Zakeri et al., 2022</xref>), mucosal-associated invariant T cells (<xref ref-type="bibr" rid="B10">Li et al., 2023</xref>), innate lymphoid cells (ILCs) (<xref ref-type="bibr" rid="B8">Jacquelot et al., 2022</xref>), and neutrophils (<xref ref-type="bibr" rid="B3">Geh et al., 2022</xref>), have demonstrated significant clinical potential, single-cell RNA-seq data alone are often insufficient to detect them reliably. This limitation arises from the potentially low RNA content of lineage-defining transcripts (<xref ref-type="bibr" rid="B15">Stoeckius et al., 2017</xref>), the presence of high levels of RNase (<xref ref-type="bibr" rid="B6">Hao et al., 2021</xref>; <xref ref-type="bibr" rid="B11">Mazzurana et al., 2021</xref>; <xref ref-type="bibr" rid="B14">Scheyltjens et al., 2022</xref>), or the fact that mRNA expression patterns do not always correlate with protein expression (<xref ref-type="bibr" rid="B15">Stoeckius et al., 2017</xref>).</p>
<p>To ensure high-quality discoveries from CITE-Seq data, the first critical step is to control the quality (QC) of the input CITE-Seq data. For QC of CITE-Seq data, previous studies performed a limited set of analyses, and there was no standalone method. To develop a desirable standalone method for CITE-Seq QC, we recognize the following three limitations in the current CITE-Seq studies. First, some studies performed QC only at the RNA level, e.g., in terms of either transcriptome library size (<xref ref-type="bibr" rid="B1">Butler et al., 2018</xref>; <xref ref-type="bibr" rid="B16">Stuart et al., 2021</xref>), transcriptomic technical artifacts such as RNA contamination (<xref ref-type="bibr" rid="B7">Hong et al., 2022</xref>), or likely empty droplets or ambient RNAs (<xref ref-type="bibr" rid="B5">Grob et al., 2023</xref>; <xref ref-type="bibr" rid="B17">Subramanian et al., 2022</xref>). However, since CITE-Seq assays both RNA and cell surface protein data, CITE-Seq QC must assess not only individual RNA quality but also the quality of protein data and their interactions with RNA data. Specifically, i) the individual protein and RNA data quality must be controlled, respectively, to faithfully identify cell types with certain surface proteins and capture the cells&#x2019; molecular profiles, and ii) the relationship between the RNAs and the proteins must be investigated since, if certain cells express a specific gene that is readily translated and transported to the surface, the surface protein abundance level is expected to be correlated with the gene expression in the cells. Second, while a small number of other studies used surface protein information for QC, they examined only a subset of the assayed surface proteins as they were interested in particular cell types marked by the surface proteins. 
For example, one study examined seven protein markers (CD3, CD4, CD8, CD14, CD16, CD19, and CD56) out of 188 available markers in the data to differentiate five cell types (B cells, CD4 T cells, CD8 T cells, classical monocytes, and natural killer cells) (<xref ref-type="bibr" rid="B12">Nettersheim et al., 2022</xref>), and another study examined four protein markers, out of 17 available markers, to differentiate four cell types (T cells, monocytes, B cells, and cytotoxic T lymphocytes) (<xref ref-type="bibr" rid="B4">Granja et al., 2019</xref>). However, to detect systematic errors that affect most assays in the data, it is important to examine the majority of RNAs and proteins rather than a small subset of them. Third, when the abovementioned studies demonstrated the relationship between genes and the corresponding proteins, they relied mostly on visual inspection of a dimensionality-reduced space (e.g., UMAP) for either the abundance level relationship between genes and proteins or their cell-type specificity. However, quantitative measures are needed to objectively assess the relationship between abundance levels and cell-type specificity. Quantitative measures can help further compare the data quality across various CITE-Seq datasets and make the QC analyses scalable.</p>
<p>In this study, we introduce CITESeQC, the first software package specifically designed to provide a comprehensive and interpretable set of quantitative metrics for assessing the quality of CITE-Seq data. Rather than performing direct filtering or removal of cells or features, CITESeQC serves as a diagnostic framework that guides users in making informed quality control (QC) decisions tailored to their dataset. CITESeQC supports multi-layered QC by offering seven modules for evaluating RNA or protein data individually and five additional modules for assessing cross-modality relationships, such as RNA&#x2013;protein consistency. To ensure systematic coverage, these 12 modules collectively assess all genes and surface proteins in the dataset while flagging low-quality features using individual QC metrics. For quantitative evaluation, CITESeQC computes Shannon entropy to assess cell type-specific expression patterns and correlation coefficients to capture expected relationships between gene expression and protein abundance. Designed for broad usability, CITESeQC guides users through a streamlined process that generates a complete markdown report, including informative visualizations and interpretations, with minimal user intervention. This flexible, user-guided approach enables researchers to evaluate data quality in a nuanced and biologically informed manner&#x2014;supporting both standardized workflows and exploratory analyses&#x2014;without relying on rigid, pre-defined thresholds.</p>
</sec>
<sec sec-type="results" id="s2">
<title>Results</title>
<sec id="s2-1">
<title>CITESeQC quantifies various aspects of CITE-Seq quality</title>
<p>CITESeQC provides 12 R modules to assess the quality of RNAs, surface proteins, and their interactions in multiple aspects and one R module to define cell clusters or import cell cluster definitions (<xref ref-type="fig" rid="F1">Figure 1</xref>). The modules also provide quantitative measures, wherever possible, to test particular hypotheses regarding the quality.<list list-type="simple">
<list-item>
<p>1. &#x201c;RNA_read_corr()&#x201d; produces a scatterplot correlating the total number of RNA molecules with the number of genes detected in the transcriptome. Since the cutoffs for good-quality cells are passed as arguments to the function, users can modify them for their data. Default values are from the Seurat-guided clustering tutorial. Spearman&#x2019;s correlation coefficient is calculated to allow users to test the hypothesis that the total number of RNA molecules increases with the number of detected genes in the transcriptome.</p>
</list-item>
<list-item>
<p>2. &#x201c;ADT_read_corr()&#x201d; produces a scatterplot correlating the number of detected ADTs with the total number of ADT molecules identified on the cell surfaces. Since the cutoffs identifying good-quality cells, which are annotated on the plot, are passed as arguments to the function, users can modify them for their data. Default values are from the Seurat-guided clustering tutorial. Spearman&#x2019;s correlation coefficient is calculated to allow users to test the hypothesis that the total number of ADT molecules increases with the number of detected ADTs on the cell surface.</p>
</list-item>
<list-item>
<p>3. &#x201c;RNA_mt_read_corr()&#x201d; produces a scatterplot correlating the number of genes identified in the transcriptome with the percentage of the mitochondrial genes. Spearman&#x2019;s correlation coefficient is calculated to allow users to test the hypothesis that the mitochondrial percentage remains constant regardless of the number of identified molecules.</p>
</list-item>
<list-item>
<p>4. &#x201c;def_clust()&#x201d; either defines the cell clusters based on the input gene expression matrix or imports the definition. To define the cell clusters, it employs the Seurat package with the input clustering resolution. For each cell cluster, whether defined internally or imported, this function identifies marker genes for later use.</p>
</list-item>
<list-item>
<p>5. &#x201c;RNA_dist()&#x201d; visualizes the specificity of the input gene expression across the cell clusters defined or imported using def_clust(). For quantification and comparison, it calculates Shannon entropy on the expression distribution across clusters, which is defined as follows: <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>z</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msub>
<mml:mi>log</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>log</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
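</inline-formula>, where <italic>N</italic> is the number of clusters (size of the alphabet) and <italic>p</italic><sub><italic>i</italic></sub> is the proportion of the gene&#x2019;s expression found in cluster <italic>i</italic>. A lower Shannon entropy value indicates that the gene&#x2019;s expression is more specific to particular clusters.</p>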
</list-item>
<list-item>
<p>6. &#x201c;multiRNA_hist()&#x201d; is a histogram of Shannon entropy values of the marker genes identified in def_clust(). The histogram displays the specificity of marker genes across clusters. Users can modify the number of marker genes. A histogram peak at high entropy values suggests that the marker genes lack specificity.</p>
</list-item>
<list-item>
<p>7. &#x201c;ADT_dist()&#x201d; visualizes the specificity of the input ADT abundance across the cell clusters. Specifically, it calculates normalized Shannon entropy on the expression distribution across clusters. Note that the clusters were defined based on gene expression unless provided by the users.</p>
</list-item>
<list-item>
<p>8. &#x201c;multiADT_hist()&#x201d; is a histogram of normalized Shannon entropy values of all ADTs identified for the cell clusters. The histogram displays the specificity of ADT markers across clusters. Note that the clusters were defined based on gene expression unless provided by the users. A histogram peak at high entropy values suggests that the ADT markers lack specificity.</p>
</list-item>
<list-item>
<p>9. &#x201c;RNA_ADT_read_corr()&#x201d; produces a scatterplot showing the correlation between the number of assayed genes in the transcriptome and the number of assayed cell surface proteins across the cells. Spearman&#x2019;s correlation coefficient is calculated to allow users to test the hypothesis that the number of assayed proteins increases with the number of assayed genes.</p>
</list-item>
<list-item>
<p>10. &#x201c;RNA_ADT_UMAP_corr()&#x201d; produces pairs of UMAP plots and a scatterplot. Each UMAP plot pair is drawn for the abundance of the input ADT and the corresponding gene expression, respectively. The scatterplot plots the abundance of ADTs and the expression of the RNAs of the input gene.</p>
</list-item>
<list-item>
<p>11. &#x201c;RNA_ADT_cluster_corr()&#x201d; is a set of scatterplots, each drawn for each cell cluster, showing the correlation between input ADT abundance and the corresponding gene expression for the cluster.</p>
</list-item>
<list-item>
<p>12. &#x201c;RNA_ADT_hist()&#x201d; is a histogram of the correlation coefficients in all pairs of ADTs and the corresponding genes in expression.</p>
</list-item>
<list-item>
<p>13. &#x201c;RNA_ADT_cluster_hist()&#x201d; is a set of histograms, each showing the distribution of the correlation coefficients in all pairs of ADTs and the corresponding genes for each cell cluster.</p>
</list-item>
</list>
</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Illustrative categorization of the 12 R functions in CITESeQC. The additional def_clust() function needs to be run before running functions in the &#x201c;with cell clusters&#x201d; category.</p>
</caption>
<graphic xlink:href="fbinf-05-1630161-g001.tif">
<alt-text content-type="machine-generated">Grid diagram categorizing analyses by color: white for prerequisites, blue for correlation, red for distribution, and purple for distribution on correlation. Categories include RNA only, ADT only, and RNA&#x2b;ADT, further divided into analyses without and with cell clusters such as RNA_read_corr and RNA_dist.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-2">
<title>CITESeQC interpretation of diagnostic quality metrics</title>
<p>We demonstrate the applicability of CITESeQC using two example CITE-Seq datasets from healthy donors. The first comprises peripheral blood mononuclear cells (PBMCs), and the second comprises cord blood mononuclear cells (CBMCs). On the datasets, three functions beginning with either &#x201c;RNA&#x201d; or &#x201c;ADT&#x201d; and ending with &#x201c;read_corr&#x201d; inspect the correlation between the total number of reads and those aligned with RNAs or proteins across cells, enabling users to test whether the alignment process contributes to the quality. CITESeQC calculates Spearman&#x2019;s correlation coefficient and a permutation-based <italic>p</italic>-value as quantitative measures. Our analysis of PBMC and CBMC datasets (<xref ref-type="sec" rid="s11">Supplementary Figures S1A&#x2013;C, 2A&#x2013;C</xref>) confirms that a valid alignment should yield a positive correlation. The functions RNA_dist() and ADT_dist() compute the distribution of a single marker gene or surface protein across cell clusters using Shannon entropy to quantify target specificity. To illustrate their utility, we examined CCR7 and CST7 in PBMCs&#x2014;canonical markers for na&#xef;ve T cells and cytotoxic lymphocytes, respectively (<xref ref-type="fig" rid="F2">Figures 2A,B</xref>). Although both are recognized markers, Seurat&#x2019;s built-in module lacks the resolution to differentiate their relative specificity across clusters (<xref ref-type="fig" rid="F2">Figures 2C&#x2013;E</xref>). In contrast, our entropy-based quantification provides a clear, interpretable measure of specificity. For example, CCR7 is less specific than CST7 (with entropy values of 2.53 and 2.34, respectively), enabling researchers to prioritize CST7 over CCR7 for downstream analyses, such as cell-type annotation, differential expression, and experimental validation. This added layer of interpretability represents a key advantage over existing methods. 
We also showed the specificity of CCR7 in CBMC and CD14 ADT in PBMC and CBMC (<xref ref-type="sec" rid="s11">Supplementary Figures S2D&#x2013;F</xref>). CD14 also shows strong specificity across PBMC and CBMC cell clusters as it is robustly expressed in classical and intermediate monocytes, with Shannon entropy values of 2.39 and 3.83, respectively. &#x201c;multiRNA_hist()&#x201d; and &#x201c;multiADT_hist()&#x201d; visualize the distribution of Shannon entropy values for marker genes and surface proteins, respectively. In our analysis, we used the top 10 marker genes for each cluster and all surface proteins identified in PBMC and CBMC (<xref ref-type="fig" rid="F2">Figures 2F,G</xref>; <xref ref-type="sec" rid="s11">Supplementary Figures S2G,H</xref>). In addition, three functions beginning with &#x201c;RNA_ADT&#x201d; and ending with &#x201c;corr&#x201d; allow practitioners to quantify the correlation between RNAs and surface proteins. Our analysis of CD14 on PBMC and CCR7 on CBMC (<xref ref-type="sec" rid="s11">Supplementary Figures S1D&#x2013;G, 3, 4, 5</xref>) visually demonstrates their specificity across cell clusters on UMAP and using correlation. Finally, two functions beginning with &#x201c;RNA_ADT&#x201d; and ending with &#x201c;hist&#x201d; visualize the distribution of the correlation either across all clusters or for each cluster. Running the functions on CCR7 and CD14 shows cluster-specific behavior of the markers (<xref ref-type="sec" rid="s11">Supplementary Figures S6, 7</xref>). Before running functions that require cell cluster definitions (e.g., RNA_dist()), def_clust() should be called to either define or import them.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>CITESeQC functions showing and quantifying relative abundance distribution of <bold>(A)</bold> CCR7 RNA and <bold>(B)</bold> CST7 in the example PBMC dataset. The amount of uncertainty in the probability distribution is measured by Shannon entropy. UMAP showing specificity across cell clusters for <bold>(C)</bold> CCR7 and <bold>(D)</bold> CST7. <bold>(E)</bold> UMAP showing the cell cluster definition in the PBMC dataset. CITESeQC functions showing the distribution of the Shannon entropy values of <bold>(F)</bold> the top 10 marker genes from each cluster and <bold>(G)</bold> all surface markers across the clusters defined in def_clust() on the example PBMC dataset.</p>
</caption>
<graphic xlink:href="fbinf-05-1630161-g002.tif">
<alt-text content-type="machine-generated">Panel A shows a line graph of Shannon entropy for the gene CCR7, with clusters labeled on the x-axis and entropy values on the y-axis. Panel B displays a similar graph for the gene CST7. Panels C and D are UMAP plots showing data distribution, with color gradients representing expression levels for different genes. Panel E is another UMAP plot, showing clusters in distinct colors, each with a numerical label. Panels F and G present bar charts depicting entropy frequency distributions with entropy on the x-axis and frequency on the y-axis.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-3">
<title>Systematic evaluation of CITESeQC&#x2019;s sensitivity to technical noise in CITE-Seq data</title>
<p>To show how CITESeQC detects systemic errors, we performed two controlled noise-injection experiments using 10% of the cells randomly selected in the PBMC dataset. First, to simulate noise introduced by systemic disruptions in feature-count relationships, we shuffled expression values for 5%, 10%, and 20% of RNA features and 10%, 20%, and 30% of ADT features. We selected higher percentages for ADT data to ensure a noticeable effect despite its smaller feature set (33,538 RNAs vs. 17 ADTs). For RNA, each condition was repeated 10 times; for ADT, 50 times for statistical significance and computational efficiency. To quantify the noise effect, CITESeQC calculates Spearman&#x2019;s correlation between nFeature (the number of unique genes or proteins detected in a cell) and nCount (total count per cell). In high-quality data, these metrics are expected to show a strong positive correlation&#x2014;cells with more detected features tend to have higher total counts. Our shuffling strategy is to preserve cell-level relationships while disrupting the gene- or protein-level relationships. In the results, we observed a consistent decrease in correlation values with increasing levels of noise for both RNA and ADT (<xref ref-type="fig" rid="F3">Figures 3A,B</xref>). The RNA modality showed a wider dynamic range of degradation due to its larger number of features. These results confirm that CITESeQC&#x2019;s correlation-based metrics are sensitive to global disruptions and can effectively capture systemic quality issues. Second, we evaluated how increasing randomness affects gene/protein specificity across clusters, a key step for downstream analyses. We randomly shuffled 10%, 20%, and 30% of RNA and ADT features, respectively, and defined clusters using the function def_clust(). For efficiency, we selected 10,000 RNA features by ranking genes according to the standard deviation of their expression across cells and retaining those with the highest variability. 
Using the defined clusters, we ran multiRNA_hist() and multiADT_hist() functions to calculate the Shannon entropy across all shuffled features. In high-quality data, markers with specificity should show low entropy. As we increased the level of noise, the entropy values exhibited a systematic increase, with the overall distribution shifting toward higher values (i.e., rightward shift). For RNA features, we observed significant shifts in Shannon entropy from 10% to 20% and from 20% to 30% (<italic>p</italic>-value: 0.04 and 0.05, respectively, <xref ref-type="fig" rid="F3">Figure 3C</xref>), suggesting a loss of cluster-specific expression patterns. A similar shift was found for ADT features, although it was not significant (<italic>p</italic>-value: 0.2 in both 10%&#x2013;20% and 20%&#x2013;30%, <xref ref-type="fig" rid="F3">Figure 3D</xref>), potentially due to the limited number of measured ADTs (n &#x3d; 17). These findings demonstrate that entropy-based metrics in CITESeQC effectively capture the erosion of biological signal due to random noise. Together, both experiments validate the sensitivity of CITESeQC to detect quality issues at multiple levels&#x2014;global structure and cluster specificity&#x2014;making it a valuable tool for CITE-Seq data QC across applications and platforms.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Spearman&#x2019;s correlation of 10% randomly selected cells in the PBMC dataset estimated between nFeature and nCount of <bold>(A)</bold> RNAs after 5%, 10%, and 20% shuffling and <bold>(B)</bold> all ADTs after 10%, 20%, and 30% shuffling. Shannon entropy density plot of marker genes in 10% randomly selected cells of the PBMC dataset estimated across defined clusters of <bold>(C)</bold> RNAs after 10%, 20%, and 30% shuffling and <bold>(D)</bold> all ADTs after 10%, 20%, and 30% shuffling.</p>
</caption>
<graphic xlink:href="fbinf-05-1630161-g003.tif">
<alt-text content-type="machine-generated">Graphs showing correlation and density analysis. (A) Box plots of Spearman correlation for RNA, with percentages shuffled, showing a decrease in correlation with increased shuffling. (B) Similar box plots for ADT, revealing a similar pattern with higher initial correlation. (C) Density plot for RNA showing Shannon entropy with varying distributions at different shuffled percentages. (D) Density plot for ADT displaying the impact of shuffling on Shannon entropy distribution, with notable peaks.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-4">
<title>CITESeQC facilitates marker specificity analysis</title>
<p>To demonstrate how CITESeQC&#x2019;s quantitative measures can improve downstream biological analysis, we systematically determined a Shannon entropy cutoff to assess the specificity of marker genes. Specifically, we focused on defining an empirical threshold that distinguishes truly cluster-specific markers from background, non-specific genes. To establish this threshold, we first randomly selected 1,000 expressed RNAs (average expression &#x3e;5) that were not differentially expressed across any clusters in the PBMC dataset to serve as a negative control. We then calculated the Shannon entropy of these non-marker genes across pre-defined clusters. Because these genes are expected to be broadly and non-specifically expressed, their entropy distribution reflects a null distribution of non-specific expression. We defined the marker specificity cutoff as the 5th percentile of this distribution (i.e., the left tail), identifying entropy values below this threshold as statistically specific. We then applied this empirical cutoff to evaluate the top 10, 20, and 30 RNA markers (ranked by differential expression <italic>p</italic>-value) identified in our analysis (<xref ref-type="sec" rid="s11">Supplementary Figure S8</xref>). Although the set with more RNA markers exhibits a heterogeneous distribution of entropy values, the cutoff clearly distinguishes significantly specific markers from non-specific markers. In PBMCs, for example, entropy values below 1.45 were deemed specific, with 26 (20%), 39 (16%), and 41 (12%) of the top 10, 20, and 30 markers, respectively, meeting this criterion (<xref ref-type="sec" rid="s11">Supplementary Table S1</xref>). In CBMCs, where the cutoff was 0.75, similar trends were observed. This analysis quantitatively validates which markers are truly specific to each cluster. By selecting cluster-specific markers based on CITESeQC entropy-based specificity, users can enhance the biological interpretability and clinical utility of single-cell data analyses. 
This is particularly important because high-specificity markers are essential for robust cell type classification, biomarker discovery, therapeutic targeting, and ensuring reproducibility across datasets.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s3">
<title>Discussion</title>
<p>The CITESeQC package is the first software package that assesses the quality of CITE-Seq data in terms of the individual RNAs, surface proteins, and their interactions. For quantitative evaluation, CITESeQC computes Shannon entropy and RNA&#x2013;ADT correlation coefficients&#x2014;two biologically informed metrics. Although entropy itself is designed to quantify expression distribution and is not a direct indicator of technical quality, it becomes informative about data quality when applied to marker genes or proteins. In high-quality CITE-Seq data, well-established cell type markers&#x2014;such as CD3 for T cells or CD19 for B cells&#x2014;should exhibit low entropy, with expression localized to the expected clusters. If these canonical markers instead show unexpectedly high entropy&#x2014;that is, broadly or randomly distributed expression&#x2014;it may suggest technical issues such as ambient RNA contamination that causes marker expression to bleed into unrelated clusters, poor clustering resolution that reflects insufficient transcriptomic signal, or antibody non-specificity or background staining in the ADT layer. Similarly, for a subset of well-characterized, high-expression surface markers, a moderate to strong positive correlation between mRNA and protein levels is expected in biologically consistent and technically sound CITE-Seq data. When known concordant markers exhibit unexpectedly low or erratic correlations, it can suggest technical artifacts such as antibody dropout or mislabeling, droplet barcoding or ambient tag misassignment, or batch effects or sample degradation. CITESeQC does not use these metrics to impose strict thresholds or automatically discard features; instead, it provides them as diagnostic tools to allow users to distinguish between meaningful biological heterogeneity and technical noise. 
Altogether, we provide a comprehensive set of computational QC measures for CITE-Seq data that assess and quantify various aspects of data quality at both individual RNA and protein levels and in their interactions.</p>
<p>To determine the quality of a CITE-Seq dataset using the quantitative measures provided by CITESeQC, the next step is to determine appropriate cutoff values for each measure. However, establishing some cutoff values is not straightforward. For example, measures correlating RNAs with their corresponding surface proteins depend not only on data quality but also on the translation efficiency of the RNAs. Even for datasets of the same quality, translation efficiency can vary across biological contexts due to post-transcriptional regulatory processes such as alternative polyadenylation and competing endogenous RNAs (<xref ref-type="bibr" rid="B2">Fan, et al., 2020</xref>; <xref ref-type="bibr" rid="B9">Kim, et al., 2020</xref>; <xref ref-type="bibr" rid="B13">Park, et al., 2018</xref>). Thus, to assess quality using correlation measures, we recommend comparing the values with those from other CITE-Seq datasets for which users have prior knowledge of data quality. In the future, to perform QC analysis without reference datasets, we plan to collect multiple CITE-Seq datasets of both high and low quality and determine cutoff values directly from the data.</p>
</sec>
<sec sec-type="methods" id="s4">
<title>Methods</title>
<sec id="s4-1">
<title>CITESeQC in user-friendly R markdown</title>
<p>CITESeQC (version 0.9.1) is an R package with minimal prerequisites and is publicly available at <ext-link ext-link-type="uri" xlink:href="https://github.com/sunjie001130/CITESeQC">https://github.com/sunjie001130/CITESeQC</ext-link>. It employs the baseline R packages&#x2014;graphics, stats, and utils&#x2014;making it easy for users to install. Both the source code and tutorial with example datasets are available to download. The tool can be used in an R script or R Markdown file. The advantage of this design is that it can allow the integration of code, visualizations, and explanations in a single document, which facilitates reproducibility and documentation of data analysis workflows. Additionally, R Markdown files do not require familiarity with command-line syntax, unlike many Linux environment-based software programs.</p>
</sec>
<sec id="s4-2">
<title>Experiment data</title>
<p>PBMCs, which have a single round nucleus, include lymphocytes (T cells, B cells, and NK cells) and monocytes isolated from peripheral blood. We downloaded the PBMC dataset from <ext-link ext-link-type="uri" xlink:href="https://www.10xgenomics.com/">https://www.10xgenomics.com/</ext-link>. CBMCs are derived from umbilical cord blood. They include hematopoietic stem/progenitor cells and immune cells that are more naive than adult PBMCs, making them valuable for studying immune development. We downloaded the CBMC dataset from <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866">https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc&#x3d;GSE100866</ext-link>.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s11">Supplementary Material</xref>; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="s6">
<title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements. Written informed consent to participate in this study was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>JS: Writing &#x2013; review and editing, Software, Visualization, Methodology, Formal analysis. RM: Conceptualization, Writing &#x2013; review and editing, Software, Visualization. SK: Conceptualization, Writing &#x2013; original draft. KY: Validation, Writing &#x2013; review and editing. HP: Methodology, Conceptualization, Funding acquisition, Writing &#x2013; original draft, Writing &#x2013; review and editing.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>This research was supported in part by the University of Pittsburgh Center for Research Computing, RRID:SCR_022735, through the resources provided. Specifically, this work used the HTC cluster, which is supported by NIH award number S10OD028483.</p>
</ack>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fbinf.2025.1630161/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fbinf.2025.1630161/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S1</label>
<caption>
<p>CITESeQC functions based on correlation drawn for the example PBMC CITE-Seq dataset. <bold>(A)</bold> The number of detected genes in each cell is plotted with the number of molecules. The cutoffs for cells of good quality are annotated on the plot between red lines. The correlation coefficient and <italic>p</italic>-value are estimated based on Spearman&#x2019;s correlation. <bold>(B)</bold> The number of detected ADTs in each cell is plotted with the number of ADT molecules. The correlation coefficient and <italic>p</italic>-value are estimated based on Spearman&#x2019;s correlation. <bold>(C)</bold> The number of molecules identified in the transcriptome is plotted with the percentage of the mitochondrial genes. The correlation trend can test whether the mitochondrial percentage remains constant regardless of the number of identified molecules. CITESeQC functions on the example PBMC dataset showing the distribution of <bold>(D)</bold> CD14 RNA and <bold>(E)</bold> ADTs for CD14 on the UMAP space, respectively. <bold>(F)</bold> Scatterplot plotting all the cells by the number of ADTs for CD14 and the expression level of RNA molecules of CD14. <bold>(G)</bold> Scatterplot plotting all the cells by the total number of all ADTs on the surface and all RNA molecules in the transcriptome.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S2</label>
<caption>
<p>CITESeQC functions based on correlation drawn for the example CBMC CITE-Seq dataset. <bold>(A)</bold> The number of detected genes in each cell is plotted with the number of molecules. The cutoffs for cells of good quality are annotated on the plot between red lines. The correlation coefficient and <italic>p</italic>-value are estimated based on Spearman&#x2019;s correlation. <bold>(B)</bold> The number of detected ADTs in each cell is plotted with the number of ADT molecules. The correlation coefficient and <italic>p</italic>-value are estimated based on Spearman&#x2019;s correlation. <bold>(C)</bold> The number of molecules identified in the transcriptome is plotted with the percentage of the mitochondrial genes. The correlation trend can test whether the mitochondrial percentage remains constant regardless of the number of identified molecules. CITESeQC functions showing and quantifying relative abundance distribution of <bold>(D)</bold> CCR7 RNA, <bold>(E)</bold> ADT-CD14 in the example CBMC dataset, and <bold>(F)</bold> ADT-CD14 in the PBMC dataset. CITESeQC functions showing the distribution of the Shannon entropy values of <bold>(G)</bold> the top 10 marker genes from each cluster and <bold>(H)</bold> all surface markers across the clusters defined in def_clust() on the example CBMC dataset.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S3</label>
<caption>
<p>Set of scatterplots <bold>(A-O)</bold> each drawn for each cell cluster in the PBMC dataset, showing the correlation between ADT-CD14 abundance and the corresponding gene CD14 expression.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S4</label>
<caption>
<p>CITESeQC functions on the example CBMC dataset showing the distribution of <bold>(A)</bold> CCR7 RNA and <bold>(B)</bold> ADTs for CCR7 on the UMAP space, respectively. <bold>(C)</bold> Scatterplot plotting all the cells by the number of ADTs for CCR7 and the expression level of RNA molecules of CCR7. <bold>(D)</bold> Scatterplot plotting all the cells by the total number of all ADTs on the surface and all RNA molecules in the transcriptome.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S5</label>
<caption>
<p>Set of scatterplots <bold>(A-R)</bold> each drawn for each cell cluster in the CBMC dataset, showing the correlation between ADT-CCR7 abundance and the corresponding gene CCR7 expression.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S6</label>
<caption>
<p>Set of histograms for PBMC data, each showing the distribution of the correlation coefficients in all pairs of ADTs and the corresponding genes across all cell clusters <bold>(A)</bold> or for each cell cluster <bold>(B-P)</bold>.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S7</label>
<caption>
<p>Set of histograms for CBMC data, each showing the distribution of the correlation coefficients in all pairs of ADTs and the corresponding genes for each cell cluster <bold>(A-R)</bold>.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S8</label>
<caption>
<p>Shannon entropy density plot of marker genes across defined clusters of RNAs after 5%, 10%, and 20% shuffling in 10% randomly selected cells of the <bold>(A)</bold> PBMC and <bold>(B)</bold> CBMC datasets with negative control density generated from 1,000 expressed non-DE genes (gray shade).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Image5.png" id="SM1" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image4.png" id="SM2" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image7.png" id="SM3" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image2.png" id="SM4" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.xlsx" id="SM5" mimetype="application/xlsx" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image1.png" id="SM6" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image8.png" id="SM7" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image6.png" id="SM8" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image3.png" id="SM9" mimetype="application/png" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/129222/overview">Lin Wan</ext-link>, Chinese Academy of Sciences (CAS), China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2897827/overview">Jingyi Cao</ext-link>, Brigham and Women&#x2019;s Hospital, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2953882/overview">Zidong Zhang</ext-link>, Icahn School of Medicine at Mount Sinai, United States</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Butler</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hoffman</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Smibert</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Papalexi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Satija</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Integrating single-cell transcriptomic data across different conditions, technologies, and species</article-title>. <source>Nat. Biotechnol.</source> <volume>36</volume>, <fpage>411</fpage>&#x2013;<lpage>420</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.4096</pub-id>
<pub-id pub-id-type="pmid">29608179</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Diergaarde</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>H. J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>3&#x27;-UTR shortening contributes to subtype-specific cancer growth by breaking stable ceRNA crosstalk of housekeeping genes</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>8</volume>, <fpage>334</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2020.00334</pub-id>
<pub-id pub-id-type="pmid">32411683</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Geh</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Leslie</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rumney</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Reeves</surname>
<given-names>H. L.</given-names>
</name>
<name>
<surname>Bird</surname>
<given-names>T. G.</given-names>
</name>
<name>
<surname>Mann</surname>
<given-names>D. A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Neutrophils as potential therapeutic targets in hepatocellular carcinoma</article-title>. <source>Nat. Rev. Gastroenterol. Hepatol.</source> <volume>19</volume> (<issue>4</issue>), <fpage>257</fpage>&#x2013;<lpage>273</lpage>. <pub-id pub-id-type="doi">10.1038/s41575-021-00568-5</pub-id>
<pub-id pub-id-type="pmid">35022608</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Granja</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Klemm</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>McGinnis</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Kathiria</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Mezger</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Corces</surname>
<given-names>M. R.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Single-cell multiomic analysis identifies regulatory programs in mixed-phenotype acute leukemia</article-title>. <source>Nat. Biotechnol.</source> <volume>37</volume> (<issue>12</issue>), <fpage>1458</fpage>&#x2013;<lpage>1465</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-019-0332-7</pub-id>
<pub-id pub-id-type="pmid">31792411</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grob</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bertolini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Carrara</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lischetti</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Tastanova</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Beisel</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>gExcite: a start-to-end framework for single-cell gene expression, hashing, and antibody analysis</article-title>. <source>Bioinformatics</source> <volume>39</volume> (<issue>5</issue>), <fpage>btad329</fpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btad329</pub-id>
<pub-id pub-id-type="pmid">37220897</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Andersen-Nissen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Mauck</surname>
<given-names>W. M.</given-names>
<suffix>III</suffix>
</name>
<name>
<surname>Zheng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Butler</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Integrated analysis of multimodal single-cell data</article-title>. <source>Cell</source> <volume>184</volume>, <fpage>3573</fpage>&#x2013;<lpage>3587.e29</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2021.04.048</pub-id>
<pub-id pub-id-type="pmid">34062119</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Koga</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bandyadka</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Leshchyk</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Akavoor</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Comprehensive generation, visualization, and reporting of quality control metrics for single-cell RNA sequencing data</article-title>. <source>Nat. Commun.</source> <volume>13</volume> (<issue>1</issue>), <fpage>1688</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-022-29212-9</pub-id>
<pub-id pub-id-type="pmid">35354805</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jacquelot</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Seillet</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Vivier</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Belz</surname>
<given-names>G. T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Innate lymphoid cells and cancer</article-title>. <source>Nat. Immunol.</source> <volume>23</volume> (<issue>3</issue>), <fpage>371</fpage>&#x2013;<lpage>379</lpage>. <pub-id pub-id-type="doi">10.1038/s41590-022-01127-z</pub-id>
<pub-id pub-id-type="pmid">35228695</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Diergaarde</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Tseng</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>H. J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>The microRNA target site landscape is a novel molecular feature associating alternative polyadenylation with immune evasion activity in breast cancer</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume>, <fpage>bbaa191</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa191</pub-id>
<pub-id pub-id-type="pmid">32844230</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y. R.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wilson</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kramer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Dawson</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Mucosal-associated invariant T cells for cancer immunotherapy</article-title>. <source>Mol. Ther.</source> <volume>31</volume> (<issue>3</issue>), <fpage>631</fpage>&#x2013;<lpage>646</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymthe.2022.11.019</pub-id>
<pub-id pub-id-type="pmid">36463401</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mazzurana</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Czarnewski</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jonsson</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Wigge</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ringn&#xe9;r</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>T. C.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Tissue-specific transcriptional imprinting and heterogeneity in human innate lymphoid cells revealed by full-length single-cell RNA-sequencing</article-title>. <source>Cell Res.</source> <volume>31</volume> (<issue>5</issue>), <fpage>554</fpage>&#x2013;<lpage>568</lpage>. <pub-id pub-id-type="doi">10.1038/s41422-020-00445-x</pub-id>
<pub-id pub-id-type="pmid">33420427</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nettersheim</surname>
<given-names>F. S.</given-names>
</name>
<name>
<surname>Armstrong</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Durant</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Blanco-Dominguez</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Orecchioni</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Titration of 124 antibodies using CITE-Seq on human PBMCs</article-title>. <source>Sci. Rep.</source> <volume>12</volume> (<issue>1</issue>), <fpage>20817</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-24371-7</pub-id>
<pub-id pub-id-type="pmid">36460735</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Park</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Rodriguez</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>3&#x27; UTR shortening represses tumor-suppressor genes in trans by disrupting ceRNA crosstalk</article-title>. <source>Nat. Genet.</source> <volume>50</volume> (<issue>6</issue>), <fpage>783</fpage>&#x2013;<lpage>789</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-018-0118-8</pub-id>
<pub-id pub-id-type="pmid">29785014</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Scheyltjens</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Van Hove</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>De Vlaminck</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kancheva</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bastos</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Vara-P&#xe9;rez</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Single-cell RNA and protein profiling of immune cells from the mouse brain and its border tissues</article-title>. <source>Nat. Protoc.</source> <volume>17</volume> (<issue>10</issue>), <fpage>2354</fpage>&#x2013;<lpage>2388</lpage>. <pub-id pub-id-type="doi">10.1038/s41596-022-00716-4</pub-id>
<pub-id pub-id-type="pmid">35931780</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stoeckius</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hafemeister</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Stephenson</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Houck-Loomis</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chattopadhyay</surname>
<given-names>P. K.</given-names>
</name>
<name>
<surname>Swerdlow</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Simultaneous epitope and transcriptome measurement in single cells</article-title>. <source>Nat. Methods</source> <volume>14</volume>, <fpage>865</fpage>&#x2013;<lpage>868</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.4380</pub-id>
<pub-id pub-id-type="pmid">28759029</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stuart</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Srivastava</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Madad</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lareau</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Satija</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Single-cell chromatin state analysis with Signac</article-title>. <source>Nat. Methods</source> <volume>18</volume> (<issue>11</issue>), <fpage>1333</fpage>&#x2013;<lpage>1341</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-021-01282-5</pub-id>
<pub-id pub-id-type="pmid">34725479</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Subramanian</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Alperovich</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Biology-inspired data-driven quality control for scientific discovery in single-cell transcriptomics</article-title>. <source>Genome Biol.</source> <volume>23</volume> (<issue>1</issue>), <fpage>267</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-022-02820-w</pub-id>
<pub-id pub-id-type="pmid">36575523</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zakeri</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Swadling</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Pallett</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Schmidt</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>Diniz</surname>
<given-names>M. O.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Characterisation and induction of tissue-resident gamma delta T-cells to target hepatocellular carcinoma</article-title>. <source>Nat. Commun.</source> <volume>13</volume> (<issue>1</issue>), <fpage>1372</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-022-29012-1</pub-id>
<pub-id pub-id-type="pmid">35296658</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>