<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article article-type="methods-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1063130</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2022.1063130</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A high-efficiency differential expression method for cancer heterogeneity using large-scale single-cell RNA-sequencing data</article-title>
<alt-title alt-title-type="left-running-head">Yuan et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2022.1063130">10.3389/fgene.2022.1063130</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Yuan</surname>
<given-names>Xin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1977737/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ma</surname>
<given-names>Shuangge</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/304234/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fa</surname>
<given-names>Botao</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1977051/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wei</surname>
<given-names>Ting</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1176289/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ma</surname>
<given-names>Yanran</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1371562/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Yifan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lv</surname>
<given-names>Wenwen</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Yue</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1057938/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zheng</surname>
<given-names>Junke</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1728583/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Guoqiang</given-names>
</name>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1588192/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Sun</surname>
<given-names>Jing</given-names>
</name>
<xref ref-type="aff" rid="aff8">
<sup>8</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1180305/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yu</surname>
<given-names>Zhangsheng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1100438/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Bioinformatics and Biostatistics</institution>, <institution>School of Life Sciences and Biotechnology</institution>, <institution>Shanghai Jiao Tong University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>SJTU-Yale Joint Center for Biostatistics and Data Science Organization</institution>, <institution>Shanghai Jiao Tong University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Biostatistics</institution>, <institution>Yale University</institution>, <addr-line>New Haven</addr-line>, <addr-line>CT</addr-line>, <country>United States</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Biochemistry and Molecular Biology</institution>, <institution>School of Basic Medical Sciences</institution>, <institution>Xi&#x2019;an Jiaotong University</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Clinical Research Institute</institution>, <institution>Shanghai Jiao Tong University School of Medicine</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Key Laboratory of Cell Differentiation and Apoptosis of Chinese Ministry of Education</institution>, <institution>Faculty of Basic Medicine</institution>, <institution>Shanghai Jiao Tong University School of Medicine</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff7">
<sup>7</sup>
<institution>State Key Laboratory of Oncogene and Related Gene</institution>, <institution>Shanghai Jiao Tong University School of Medicine</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff8">
<sup>8</sup>
<institution>Shanghai Minimally Invasive Surgery Center</institution>, <institution>Department of General Surgery</institution>, <institution>Ruijin Hospital</institution>, <institution>Shanghai Jiao Tong University School of Medicine</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff9">
<sup>9</sup>
<institution>Center for Biomedical Data Science</institution>, <institution>Translational Science Institute</institution>, <institution>Shanghai Jiao Tong University School of Medicine</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/671469/overview">Hongmei Jiang</ext-link>, Northwestern University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1114704/overview">Keren Li</ext-link>, University of Alabama at Birmingham, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/801274/overview">Lingfei Wang</ext-link>, Broad Institute, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jing Sun, <email>sj11788@rjh.com.cn</email>; Zhangsheng Yu, <email>yuzhangsheng@sjtu.edu.cn</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Statistical Genetics and Methodology, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>11</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>1063130</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>10</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>14</day>
<month>11</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Yuan, Ma, Fa, Wei, Ma, Wang, Lv, Zhang, Zheng, Chen, Sun and Yu.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Yuan, Ma, Fa, Wei, Ma, Wang, Lv, Zhang, Zheng, Chen, Sun and Yu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Colorectal cancer is a highly heterogeneous disease. Tumor heterogeneity limits the efficacy of cancer treatment. Single-cell RNA-sequencing technology (scRNA-seq) is a powerful tool for studying cancer heterogeneity at cellular resolution. The sparsity, heterogeneous diversity, and fast-growing scale of scRNA-seq data pose challenges to the flexibility, accuracy, and computing efficiency of differential expression (DE) methods. We proposed HEART (high-efficiency and robust test), a statistical combination test that can detect DE genes with various sources of differences beyond mean expression changes. To validate the performance of HEART, we compared HEART with six other popular DE methods on various simulation datasets with different settings generated by two simulation mechanisms. HEART had high accuracy (<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score &#x3e;0.75) and excellent computational efficiency (less than 2&#xa0;min) on multiple simulation datasets in various experimental settings. HEART performed well on DE gene detection for the PBMC68K dataset quantified by UMI counts and the human brain single-cell dataset quantified by read counts (<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score &#x3d; 0.79 and 0.65, respectively). By applying HEART to the single-cell dataset of a colorectal cancer patient, we found several potential blood-based biomarkers (CTTN, S100A4, S100A6, UBA52, FAU, and VIM) associated with colorectal cancer metastasis and validated them on additional spatial transcriptomic data of other colorectal cancer patients.</p>
</abstract>
<kwd-group>
<kwd>combination test</kwd>
<kwd>differential analysis</kwd>
<kwd>colorectal cancer</kwd>
<kwd>PBMC68K</kwd>
<kwd>DE gene</kwd>
</kwd-group>
<contract-num rid="cn001">12171318</contract-num>
<contract-num rid="cn002">20JC1410100</contract-num>
<contract-num rid="cn003">21ZR1436300</contract-num>
<contract-sponsor id="cn001">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Science and Technology Commission of Shanghai Municipality<named-content content-type="fundref-id">10.13039/501100003399</named-content>
</contract-sponsor>
<contract-sponsor id="cn003">Science and Technology Commission of Shanghai Municipality<named-content content-type="fundref-id">10.13039/501100003399</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Colorectal cancer (CRC) was the world&#x2019;s third most common cause of cancer mortality, with more than 850000 deaths annually (<xref ref-type="bibr" rid="B1">Biller and Schrag, 2021</xref>). The colorectal cancer mortality rate was high in the setting of metastatic disease or recurrence. Predicting tumor response and selecting personalized cancer therapies based on validated biomarkers is important. Tumor heterogeneity is the major obstacle to cancer treatment (<xref ref-type="bibr" rid="B17">Linnekamp et al., 2015</xref>; <xref ref-type="bibr" rid="B6">Eide et al., 2021</xref>). Identifying differential expression genes (DE genes) associated with tumors is critical in investigating cancer heterogeneity (<xref ref-type="bibr" rid="B29">Soneson and Robinson, 2018</xref>; <xref ref-type="bibr" rid="B33">Wang et al., 2019</xref>; <xref ref-type="bibr" rid="B14">Kharchenko, 2021</xref>). Many differential expression analysis methods for bulk RNA sequencing data focus on the comparison at the mean level and ignore some multi-source heterogeneities. Sequencing technologies develop rapidly, and single-cell RNA-sequencing (scRNA-seq) has become increasingly widespread. Technological improvements in single-cell RNA sequencing drive novel biological insights and new problems in data analysis. Developments in single-cell RNA-sequencing enable research on cancer heterogeneity at a high resolution. In contrast with bulk RNA sequencing data, the scRNA-seq data have extensive data sizes, significant fractions of observed zeros, and various gene expression patterns (<xref ref-type="bibr" rid="B29">Soneson and Robinson, 2018</xref>; <xref ref-type="bibr" rid="B33">Wang et al., 2019</xref>; <xref ref-type="bibr" rid="B14">Kharchenko, 2021</xref>). They are large-scale, highly sparse, variable, and complex. 
Emerging data features unique to scRNA-seq data require novel differential expression analysis methods to detect DE genes (<xref ref-type="bibr" rid="B40">Zheng et al., 2017</xref>; <xref ref-type="bibr" rid="B5">Ding et al., 2020</xref>).</p>
<p>Several DE methods for single-cell data have been proposed to fit the data characteristics in scRNA-seq data. There are two classes of methods in principle: model-based and test-based methods. Model-based DE methods model gene expression parametrically, with strong assumptions about its theoretical distribution. For example, SCDE (<xref ref-type="bibr" rid="B13">Kharchenko et al., 2014</xref>) assumes a mixture of Poisson (dropout) and negative binomial (amplification) distributions for the distribution of genes. DESeq2 (<xref ref-type="bibr" rid="B20">Love et al., 2014</xref>) tests differential expression using negative binomial generalized linear models. MAST (<xref ref-type="bibr" rid="B8">Finak et al., 2015</xref>) fits two-part, generalized linear models for characterizing heterogeneity in scRNA-seq data. Monocle3 (<xref ref-type="bibr" rid="B31">Trapnell et al., 2014</xref>; <xref ref-type="bibr" rid="B26">Qiu et al., 2017</xref>) uses the quasi-Poisson, or negative binomial distribution, to model gene expression counts across cells. NBID (<xref ref-type="bibr" rid="B2">Chen et al., 2018</xref>) calculates each gene&#x2019;s independent dispersion in each group based on the negative binomial distribution. SC2P (<xref ref-type="bibr" rid="B35">Wu et al., 2018</xref>) assumes that gene expression has two phases and employs a zero-inflated Poisson (ZIP) distribution and a lognormal-Poisson (LNP) model to describe gene expression. Thus, any deviation between the assumed and actual distributions causes accuracy issues. Moreover, the growth of experimental techniques requires single-cell algorithms to be scalable to handle sheer volumes of data. Large-scale, sparse single-cell data with a prevalence of zero values make convergence of model parameters challenging. Model-based DE methods have limited scalability and an evident decline in computing performance on large-scale datasets. 
Statistical tests are widely used alternatives to model-based DE methods because they have fewer assumptions and lower computing complexity. For example, Seurat, a popular scRNA-seq tool, sets the Wilcoxon rank-sum test as the default test to find differentially expressed genes between two groups of cells. However, tests applied to scRNA-seq data are still classical statistical tests and are not grounded in biology. Classical parametric statistical tests, such as the <italic>t</italic>-test, z-test, and F-test, perform poorly due to the extreme skewness caused by the sparsity of the scRNA-seq datasets. Non-parametric tests, such as the Wilcoxon rank-sum test, accommodate the sparsity of scRNA-seq data. However, they have poor accuracy because of the high heterogeneity and complexity of scRNA-seq data. The probabilities of Type I errors of the non-parametric tests vary systematically with increasing heterogeneity of variances and remain relatively constant even if the sample size increases (<xref ref-type="bibr" rid="B43">Zimmerman, 2000</xref>). Furthermore, non-parametric tests focus more on location than on distribution shape, so they cannot sensitively capture various biological differences in scRNA-seq data. Each of these two types of methods has its advantages and limitations. Existing DE methods, whether model-based or test-based, have difficulty balancing accuracy and computational efficiency simultaneously in large-scale single-cell data.</p>
<p>In this study, we present HEART, a scalable combination test for DE analysis of single-cell data. Under this test framework, HEART can sensitively detect biological differences in gene expression beyond mean expression shift. We illustrate the benefits of HEART by comparing the performance of HEART and six other DE methods (DESeq2 (<xref ref-type="bibr" rid="B20">Love et al., 2014</xref>), MAST (<xref ref-type="bibr" rid="B8">Finak et al., 2015</xref>), Monocle3 (<xref ref-type="bibr" rid="B31">Trapnell et al., 2014</xref>; <xref ref-type="bibr" rid="B26">Qiu et al., 2017</xref>), NBID (<xref ref-type="bibr" rid="B2">Chen et al., 2018</xref>), SC2P (<xref ref-type="bibr" rid="B35">Wu et al., 2018</xref>), Seurat) in extensive simulation experiments based on two simulation generation mechanisms. HEART performs well in accuracy, scalability, statistical robustness and computational efficiency. We demonstrated that HEART performs robustly on two real single-cell datasets with different quantification schemes. Furthermore, we applied HEART to a single-cell dataset of a colorectal cancer patient and identified several potentially metastasis-related biomarkers, including CTTN, S100A4, and S100A6.</p>
</sec>
<sec id="s2">
<title>2 Results</title>
<sec id="s2-1">
<title>2.1 HEART overview</title>
<p>Droplet-based single-cell RNA-sequencing methods measure gene expression on tens or hundreds of thousands of cells at the single-cell level. Gene expression measurements in droplet technology are often in the form of low counts with a large fraction of zero values, making it difficult to estimate the exact statistical distribution. We decomposed the gene expression distribution into two parts (<xref ref-type="fig" rid="F1">Figure 1B</xref>): the status of genes (&#x201c;On/Off&#x201d;) and the distribution shape of the gene &#x201c;On&#x201d; part (non-zero part). These two parts were closely associated with cell type, cell condition, or other biologically driven factors. For the first part, the gene expression state ratio was defined as the proportion of cells in a group in which the gene has a positive count. For the gene &#x201c;On&#x201d; part, we described the distribution shape by a location parameter (<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) and scale parameter (<inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) of the &#x201c;On&#x201d; parts (<xref ref-type="fig" rid="F1">Figure 1B</xref>). Therefore, the whole gene expression pattern can be approximated by three parameters: the zero proportion of gene expression (<inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>), the mean of the &#x201c;On&#x201d; parts (<inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>), and the variance of the &#x201c;On&#x201d; parts (<inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>). We assumed that non-DE genes have the same expression distribution shape in pre-defined groups (<xref ref-type="fig" rid="F1">Figure 1A</xref>). We tested three parameters (<inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2002;</mml:mtext>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2002;</mml:mtext>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>) to identify whether a given gene is a DE gene (<xref ref-type="fig" rid="F1">Figure 1C</xref>). Due to low counts, sparsity, and complexity of gene expression, it is challenging to estimate the exact distribution of every gene and construct a suitable statistic for the hypothesis <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> when the theoretical distribution of genes is unknown. Instead of generating the test statistic based on the assumed distribution, we tested the complex null hypothesis <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> using Fisher&#x2019;s (<xref ref-type="bibr" rid="B37">Zappia et al., 2017</xref>) theory of combination test.<disp-formula id="equ1">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>&#x398;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2002;</mml:mtext>
<mml:msub>
<mml:mi>&#x398;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>&#x398;</mml:mi>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2009;</mml:mo>
<mml:mo>&#x2009;</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ2">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msubsup>
<mml:mi>&#x398;</mml:mi>
<mml:mn>0</mml:mn>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ3">
<mml:math id="m13">
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>01</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>02</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>03</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2260;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>An overview of the HEART. <bold>(A)</bold> Diagram of non-differential and differential gene expression patterns. The non-differential genes have the same statistical distribution in different groups. Differential gene expression patterns have several modes with different characteristics. <bold>(B)</bold> The gene expression distribution decomposes into two parts: the gene expression state (&#x201c;On/Off&#x201d;) and the gene expression shape when the gene is &#x201c;On.&#x201d; Two parameters could approximate the distribution shape of the gene &#x201c;On&#x201d; part: the location parameter <inline-formula id="inf11">
<mml:math id="m14">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the scale parameter <inline-formula id="inf12">
<mml:math id="m15">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <bold>(C)</bold> HEART&#x2019;s combination statistical test structure. Combination test flow chart.</p>
</caption>
<graphic xlink:href="fgene-13-1063130-g001.tif"/>
</fig>
<p>We split the complex null hypothesis <inline-formula id="inf13">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into three simple null hypotheses <inline-formula id="inf14">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and got a new statistic <inline-formula id="inf15">
<mml:math id="m18">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> by combining three individual <italic>p</italic>-values <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Each <italic>p</italic>-value <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> was obtained by testing the simple null hypothesis <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The chi-square distribution was used to approximate the <italic>p</italic>-value of <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Under this test framework, we easily captured various differences in gene expression and constructed a test for gene expression patterns without many assumptions. Moreover, we only calculated three simple observed test statistics and got the new statistic <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> by combining three individual <italic>p</italic>-values <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We could quickly identify differential expression (DE) genes in million-scale scRNA-seq data. The computation cost is almost negligible. If the new statistic <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is larger than the critical value, we reject the null hypothesis and identify the gene as a DE gene. We examined one gene at a time and implemented FDR correction for <italic>p</italic>-values of all genes.</p>
</sec>
<sec id="s2-2">
<title>2.2 HEART validation</title>
<p>HEART proposed a combination test to catch various sources of differences in gene expression patterns between two pre-defined groups. To validate the performance of HEART, we used two simulation data generation mechanisms to compare HEART with six other popular DE methods, including five model-based DE methods (DESeq2, MAST, Monocle3, NBID, and SC2P) and the default test in Seurat (Seurat-W). Simulation details are provided in the &#x201c;Methods&#x201d;. Briefly, the artificial simulation tool, the Splatter package (<xref ref-type="bibr" rid="B37">Zappia et al., 2017</xref>), generated the datasets in Simulation 1. Simulation 2 used a semi-simulation mechanism based on actual scRNA-seq data (PBMC68K) to create simulation datasets. In both simulations, we varied the number of samples and the DE strength of DE genes. We evaluated the ability to identify DE genes, FDR control under the null hypothesis, and computational efficiency under various alternatives by a series of indexes: <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score, TPR, precision, computational time, etc.</p>
<p>In simulation 1, we evaluated the performances of each method on simulation datasets with the same simulation settings. HEART, Monocle3, and NBID perform better than other methods (<xref ref-type="fig" rid="F2">Figure 2A</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S2</xref>). They had higher <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores than other methods and achieved a good balance between TPR and precision. Seurat had low precisions, because it was apt to identify the gene with mild signals. DESeq2 maintained high accuracy on medium-scale data (under 10000 cells), but it shows FDR inflation on the large-scale datasets (<xref ref-type="sec" rid="s11">Supplementary Figure S2</xref>). Regarding running time, HEART and Seurat had incomparable advantages (<xref ref-type="fig" rid="F2">Figure 2D</xref>, under 2&#xa0;min on the datasets of 20000 cells). Although NBID and DESeq2 had good accuracy, they required a lot of running time (<xref ref-type="fig" rid="F2">Figure 2D</xref>, more than 1&#xa0;h on the datasets of 20000 cells with 11000 genes).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Simulation results. <bold>(A)</bold> <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores and TPRs of all methods on simulation datasets in Simulation 1 (de.factor &#x3d; 0.5). Plots show <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores (y-axis) and TPRs (y-axis) for different sample sizes (x-axis) for different methods. Colorful points correspond to varied sample sizes. <bold>(B)</bold> <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores and TPRs of all methods on simulation datasets in Simulation 2 (FC &#x3d; 2.5). Plots show <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores (y-axis) and TPRs (y-axis) for different source data (x-axis) for different methods. Colorful points correspond to different source datasets with different cells. <bold>(C)</bold> Semi-simulation data generation mechanism in Simulation 2. <bold>(D)</bold> and <bold>(E)</bold> Computational time of different methods for analyzing data with different sample sizes in Simulation1 and Simulation 2, respectively. The X-axis in <bold>(E)</bold> corresponds to the legend of <bold>(B)</bold>.</p>
</caption>
<graphic xlink:href="fgene-13-1063130-g002.tif"/>
</fig>
<p>In Simulation2, we generated semi-simulation data from real scRNA-seq datasets instead of simulation datasets from artificial protocols (<xref ref-type="fig" rid="F2">Figure 2B</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S3</xref>) (<xref ref-type="bibr" rid="B2">Chen et al., 2018</xref>). We chose each cell subtype with various sample sizes from PBMC68K (<xref ref-type="bibr" rid="B40">Zheng et al., 2017</xref>) as source data to test the stability and scalability of each DE method. HEART, NBID, and Monocle3 have higher <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores in different simulation datasets than other methods. When the sample size was adequate, HEART had good and stable performance, regardless of the statistical characteristics of the datasets. Seurat performed unstably on different datasets. DESeq2, MAST, and SC2P could not detect DE genes in most scenarios. Importantly, HEART was much more computationally efficient than the other methods (<xref ref-type="fig" rid="F2">Figure 2E</xref>). For datasets on the scale of 20000 cells, HEART completed computation in about 1&#x2013;2&#xa0;min, but NBID and DESeq2 needed 5&#x2013;7&#xa0;h for datasets of the same scale. In theory, HEART is applicable to datasets with millions of cells. We generated null simulations without swapping genes to test the bias in <italic>p</italic>-value estimation for each method (<xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>). HEART controlled the type I error well.</p>
<p>Generally, HEART was an accurate, practical and scalable method for DE gene detection. In all semi-simulation scenarios, HEART and NBID performed better than other methods and had relatively stable performances on datasets with various characteristics. Other methods had poor performances on some semi-simulation datasets. As the sample size increases, the performances of HEART, NBID, and Monocle3 become better. However, HEART identified DE genes in the simulation scenarios with weak DE strength of differences, which means HEART was more sensitive than other competing DE methods (<xref ref-type="sec" rid="s11">Supplementary Figure S3</xref>; <xref ref-type="fig" rid="F3">Figure 3</xref>). The performance of NBID was slightly better than HEART in some scenarios, but it took a lot of time to run. (Simulation1 of 20000 cells: NBID: <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score &#x3d; 0.871, running time &#x3d; 6482&#xa0;s; HEART: <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score &#x3d; 0.84, running time &#x3d; 52&#xa0;s. Simulation2 of CD8<sup>&#x2b;</sup> cytotoxic T cells: NBID: <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score &#x3d; 0.97, running time &#x3d; 16205&#xa0;s; HEART: <inline-formula id="inf33">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score &#x3d; 0.94, running time &#x3d; 94&#xa0;s.)</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Comparing all methods: known DE genes among the top ranked DE genes in human brain cells for astrocytes and oligodendrocytes cells.</p>
</caption>
<graphic xlink:href="fgene-13-1063130-g003.tif"/>
</fig>
</sec>
<sec id="s2-3">
<title>2.3 HEART is accurate and robust on read and unique molecular identifier counts data</title>
<p>Read count and unique molecular identifier (UMI) count are the two main quantification schemes in single-cell RNA-sequencing technologies and have different statistical characteristics. Some literature (<xref ref-type="bibr" rid="B42">Zilionis et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Chen et al., 2018</xref>; <xref ref-type="bibr" rid="B12">Kashima et al., 2020</xref>; <xref ref-type="bibr" rid="B27">Sarkar and Stephens, 2021</xref>) suggested that read count data have higher count levels, more sparsity and more variability than UMI count data. To assess the accuracy and robustness of HEART on different quantification mechanisms, we applied HEART and six other DE methods (Seurat, DESeq2, MAST, Monocle3, NBID, and SC2P) to two real single-cell datasets from different quantification schemes: a human brain dataset (<xref ref-type="bibr" rid="B4">Darmanis et al., 2015</xref>) (GSE67835) based on read counts, and a dataset of peripheral blood mononuclear cells (PBMC68K (<xref ref-type="bibr" rid="B40">Zheng et al., 2017</xref>)) quantified by UMI counts.</p>
<sec id="s2-3-1">
<title>2.3.1 Performances on human brain data</title>
<p>The human brain data (GSE67835) (<xref ref-type="bibr" rid="B4">Darmanis et al., 2015</xref>) is a single-cell dataset quantified by read count. It comprises 466 sequenced cells from human cortical tissue containing six sub-cell types. In this human brain data, we used all seven DE methods to identify DE genes between two groups of cells (astrocytes: 62 cells, oligodendrocytes: 38 cells) with 10483 genes. The number of DE genes identified by the different DE methods varied greatly (<xref ref-type="table" rid="T1">Table 1</xref>). At an FDR of 5%, HEART identified 973 DE genes. For Standard 1, we obtained a list of 41 DE genes between these two sub-cell types by comparing purified cell types <italic>via</italic> bulk RNA-seq (<xref ref-type="bibr" rid="B39">Zhang et al., 2014</xref>; <xref ref-type="bibr" rid="B4">Darmanis et al., 2015</xref>). DE genes identified by HEART cover all 41 DE genes in Standard 1. NBID and SC2P also identified 41 DE genes in Standard 1. Still, they identified too many genes as DE genes (NBID: 6116 DE genes, SC2P: 2220 DE genes) and had low specificities (NBID: specificity &#x3d; 0.42, SC2P: specificity &#x3d; 0.79), suggesting potentially false signals. Under Standard 2 (top 500 genes) and Standard 3 (top 1,000 genes), HEART had the highest <inline-formula id="inf34">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores and relatively high TPRs and specificities compared to other DE methods (<xref ref-type="table" rid="T1">Table 1</xref>). Moreover, we compared the ability of each method to recover the 41 known DE genes from the literature among its top-ranked DE genes (<xref ref-type="fig" rid="F3">Figure 3</xref>). <xref ref-type="fig" rid="F3">Figure 3</xref> showed that HEART, MAST, and SC2P have higher sensitivity and reliability in capturing true DE signals than the other four DE methods.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>The time consumption, number of DE genes, TPR, specificity, and <inline-formula id="inf35">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score of each method under three different standards (Human brain data).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Method</th>
<th rowspan="2" align="left">Time (s)</th>
<th rowspan="2" align="left">&#x23;(DE genes)</th>
<th colspan="3" align="left">Standard 1</th>
<th colspan="3" align="left">Standard 2</th>
<th colspan="3" align="left">Standard 3</th>
</tr>
<tr>
<th align="left">TPR</th>
<th align="left">Specificity</th>
<th align="left">
<inline-formula id="inf36">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score</th>
<th align="left">TPR</th>
<th align="left">Specificity</th>
<th align="left">
<inline-formula id="inf37">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score</th>
<th align="left">TPR</th>
<th align="left">Specificity</th>
<th align="left">
<inline-formula id="inf38">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HEART</td>
<td align="left">9.74</td>
<td align="left">973</td>
<td align="left">1.00</td>
<td align="left">0.91</td>
<td align="left">0.08</td>
<td align="left">0.96</td>
<td align="left">0.95</td>
<td align="left">0.65</td>
<td align="left">0.96</td>
<td align="left">0.95</td>
<td align="left">0.65</td>
</tr>
<tr>
<td align="left">Seurat</td>
<td align="left">10.79</td>
<td align="left">2,943</td>
<td align="left">0.93</td>
<td align="left">0.72</td>
<td align="left">0.03</td>
<td align="left">0.83</td>
<td align="left">0.75</td>
<td align="left">0.24</td>
<td align="left">0.83</td>
<td align="left">0.75</td>
<td align="left">0.24</td>
</tr>
<tr>
<td align="left">DESeq2</td>
<td align="left">75.36</td>
<td align="left">5,814</td>
<td align="left">1.00</td>
<td align="left">0.45</td>
<td align="left">0.01</td>
<td align="left">0.99</td>
<td align="left">0.47</td>
<td align="left">0.16</td>
<td align="left">0.99</td>
<td align="left">0.47</td>
<td align="left">0.16</td>
</tr>
<tr>
<td align="left">MAST</td>
<td align="left">82.35</td>
<td align="left">2,155</td>
<td align="left">0.98</td>
<td align="left">0.80</td>
<td align="left">0.04</td>
<td align="left">0.99</td>
<td align="left">0.83</td>
<td align="left">0.37</td>
<td align="left">0.99</td>
<td align="left">0.83</td>
<td align="left">0.37</td>
</tr>
<tr>
<td align="left">Monocle3</td>
<td align="left">60.73</td>
<td align="left">154</td>
<td align="left">0.20</td>
<td align="left">0.99</td>
<td align="left">0.08</td>
<td align="left">0.29</td>
<td align="left">1.00</td>
<td align="left">0.45</td>
<td align="left">0.29</td>
<td align="left">1.00</td>
<td align="left">0.45</td>
</tr>
<tr>
<td align="left">NBID</td>
<td align="left">263.01</td>
<td align="left">6,116</td>
<td align="left">0.80</td>
<td align="left">0.42</td>
<td align="left">0.01</td>
<td align="left">0.99</td>
<td align="left">0.44</td>
<td align="left">0.15</td>
<td align="left">0.99</td>
<td align="left">0.44</td>
<td align="left">0.15</td>
</tr>
<tr>
<td align="left">SC2P</td>
<td align="left">27.28</td>
<td align="left">2,220</td>
<td align="left">1.00</td>
<td align="left">0.79</td>
<td align="left">0.04</td>
<td align="left">0.99</td>
<td align="left">0.83</td>
<td align="left">0.36</td>
<td align="left">0.99</td>
<td align="left">0.83</td>
<td align="left">0.36</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>
<bold>(A)</bold> Heatmap of marker genes for MK3 and MK5 clusters. <bold>(B)</bold> Heatmap of some DE genes detected by HEART. <bold>(C)</bold> Venn diagram of DE genes from 3&#xa0;cell types of tumor and normal cells. <bold>(D)</bold> Violin plots showing some DE genes&#x2019; expression patterns in MK3, MK5 cluster, Epithelial cluster, and fibroblast cluster in tumor and normal tissues, respectively.</p>
</caption>
<graphic xlink:href="fgene-13-1063130-g004.tif"/>
</fig>
<p>In this human brain single-cell dataset quantified by read counts, HEART performed best among the seven DE methods. Under different standards, HEART always had excellent accuracy for DE gene detection. DESeq2 and NBID had high TPRs, but they may have detected false DE genes because they identified an overabundance of genes as DE genes.</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Performances on PBMC68K</title>
<p>PBMC68K (<xref ref-type="bibr" rid="B33">Wang et al., 2019</xref>) is a single-cell UMI count dataset of peripheral blood mononuclear cells (PBMCs) generated by 10X Genomics. T cells are the most abundant cell type in PBMCs and play an essential role in the immune response and immune regulation. Na&#xef;ve T cells and memory T cells have significant differences in functions and features, but they have a large degree of similarity in their overall gene expression (<xref ref-type="sec" rid="s11">Supplementary Material S1</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S1</xref>). Research on the gene expression patterns of the two types of T cells is still inadequate (<xref ref-type="bibr" rid="B19">Liu et al., 2001</xref>; <xref ref-type="bibr" rid="B34">Weng et al., 2012</xref>). We used all seven DE methods (HEART, Seurat, DESeq2, MAST, Monocle3, NBID, and SC2P) to detect DE genes between CD4<sup>&#x2b;</sup> Na&#xef;ve T cells (1,873 cells) and CD4<sup>&#x2b;</sup> memory T cells (3,061 cells) from the PBMC68K (<xref ref-type="bibr" rid="B40">Zheng et al., 2017</xref>) dataset with 12406 genes. The number of DE genes identified by each method was very different. HEART, Seurat, DESeq2, MAST, Monocle3, NBID, and SC2P selected 692, 676, 459, 36, 431, 1,214, and 121 genes, respectively (<xref ref-type="table" rid="T2">Table 2</xref>). For Standard 1, 37 known DE genes from the literature were obtained from various microarray experiments of T cells from both humans and mice (<xref ref-type="bibr" rid="B19">Liu et al., 2001</xref>; <xref ref-type="bibr" rid="B34">Weng et al., 2012</xref>). None of the DE methods in our research fully identified these 37 true DE genes. HEART, Seurat, DESeq2, Monocle3, MAST, NBID, and SC2P captured 20, 9, 12, 16, 0, 28, and 4 of these DE genes, respectively (<xref ref-type="table" rid="T2">Table 2</xref>). NBID detected the most DE genes of Standard 1, but it also identified the most genes (1,214 genes) as DE genes. 
HEART and Monocle3 had relatively higher TPR, specificity, and <inline-formula id="inf39">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores than other methods (<xref ref-type="table" rid="T2">Table 2</xref>). Note that some genes with very low expression, such as FAS and TNF (average UMI count in two groups: 0.017, 0.028), were only detected by HEART and NBID. Under Standard 2 and Standard 3, HEART had higher <inline-formula id="inf40">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores (0.77 and 0.79) than other test-based DE methods and most model-based DE methods (<xref ref-type="table" rid="T2">Table 2</xref>). HEART had high TPRs while ensuring high specificity. Moreover, on these datasets of thousands of cells, HEART only needed 40&#xa0;s to run, while DESeq2 and NBID took about an hour. In this application of real-data DE analysis, HEART had good accuracy as assessed by different standards and required a short running time. In particular, HEART performed better than the test-based method Seurat. Compared with model-based DE methods, HEART had higher <inline-formula id="inf41">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores than most model-based DE methods and ran faster than all model-based DE methods.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>The time consumption, number of DE genes, sensitivity, specificity, and <inline-formula id="inf42">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score of each method under three different standards (PBMC68K).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Method</th>
<th rowspan="2" align="left">Time (s)</th>
<th rowspan="2" align="left">&#x23;(DE genes)</th>
<th colspan="3" align="left">Standard 1</th>
<th colspan="3" align="left">Standard 2</th>
<th colspan="3" align="left">Standard 3</th>
</tr>
<tr>
<th align="left">TPR</th>
<th align="left">Specificity</th>
<th align="left">
<inline-formula id="inf43">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score</th>
<th align="left">TPR</th>
<th align="left">Specificity</th>
<th align="left">
<inline-formula id="inf44">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score</th>
<th align="left">TPR</th>
<th align="left">Specificity</th>
<th align="left">
<inline-formula id="inf45">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HEART</td>
<td align="left">40</td>
<td align="left">692</td>
<td align="left">0.54</td>
<td align="left">0.95</td>
<td align="left">0.05</td>
<td align="left">0.92</td>
<td align="left">0.98</td>
<td align="left">0.77</td>
<td align="left">0.67</td>
<td align="left">1.00</td>
<td align="left">0.79</td>
</tr>
<tr>
<td align="left">Seurat</td>
<td align="left">7</td>
<td align="left">676</td>
<td align="left">0.24</td>
<td align="left">0.95</td>
<td align="left">0.03</td>
<td align="left">0.71</td>
<td align="left">0.97</td>
<td align="left">0.60</td>
<td align="left">0.52</td>
<td align="left">0.99</td>
<td align="left">0.62</td>
</tr>
<tr>
<td align="left">DESeq2</td>
<td align="left">3,345</td>
<td align="left">459</td>
<td align="left">0.32</td>
<td align="left">0.96</td>
<td align="left">0.05</td>
<td align="left">0.83</td>
<td align="left">1.00</td>
<td align="left">0.87</td>
<td align="left">0.46</td>
<td align="left">1.00</td>
<td align="left">0.63</td>
</tr>
<tr>
<td align="left">MAST</td>
<td align="left">753</td>
<td align="left">36</td>
<td align="left">0.00</td>
<td align="left">1.00</td>
<td align="left">0.00</td>
<td align="left">0.00</td>
<td align="left">1.00</td>
<td align="left">0.00</td>
<td align="left">0.00</td>
<td align="left">1.00</td>
<td align="left">0.00</td>
</tr>
<tr>
<td align="left">Monocle3</td>
<td align="left">290</td>
<td align="left">431</td>
<td align="left">0.43</td>
<td align="left">0.97</td>
<td align="left">0.07</td>
<td align="left">0.81</td>
<td align="left">1.00</td>
<td align="left">0.87</td>
<td align="left">0.43</td>
<td align="left">1.00</td>
<td align="left">0.60</td>
</tr>
<tr>
<td align="left">NBID</td>
<td align="left">3,905</td>
<td align="left">1,214</td>
<td align="left">0.76</td>
<td align="left">0.90</td>
<td align="left">0.04</td>
<td align="left">0.97</td>
<td align="left">0.94</td>
<td align="left">0.56</td>
<td align="left">0.85</td>
<td align="left">0.97</td>
<td align="left">0.77</td>
</tr>
<tr>
<td align="left">SC2P</td>
<td align="left">223</td>
<td align="left">121</td>
<td align="left">0.11</td>
<td align="left">0.99</td>
<td align="left">0.05</td>
<td align="left">0.23</td>
<td align="left">1.00</td>
<td align="left">0.37</td>
<td align="left">0.12</td>
<td align="left">1.00</td>
<td align="left">0.22</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 HEART identifies metastatic colorectal cancer biomarkers</title>
<p>Colorectal cancer (CRC) is one of the most commonly diagnosed cancers in the world. Approximately 20% of individuals with newly diagnosed colorectal cancer have metastatic disease upon presentation, and another 25% of those who initially have localized illness will eventually acquire metastases (<xref ref-type="bibr" rid="B1">Biller and Schrag, 2021</xref>). Distant metastasis is the main cause of death in patients with colorectal cancer, but the exact metastasis mechanism is still unknown (<xref ref-type="bibr" rid="B39">Zhang et al., 2014</xref>). ScRNA-seq technology provides a new opportunity to investigate the association between genes and the mechanism of tumor initiation, progression, and metastasis (<xref ref-type="bibr" rid="B16">Lawson et al., 2018</xref>). Therefore, we applied HEART to a single-cell dataset (containing three sub-datasets: PBMC, normal tissue, and tumor tissue) of a stage III colorectal cancer patient. We used HEART to identify DE genes between tumor and normal fibroblasts and between tumor and normal epitheliums. Furthermore, we found two subpopulations of megakaryocytes (MKs) (<xref ref-type="bibr" rid="B32">Wang et al., 2021</xref>) in the PBMCs and utilized HEART to detect 207 DE genes between the 2&#xa0;MK subtype clusters to characterize functional differences and underlying molecular mechanisms. Highly expressed genes in the cluster MK3 (<xref ref-type="bibr" rid="B28">Satija et al., 2015</xref>; <xref ref-type="bibr" rid="B30">Stuart et al., 2019</xref>; <xref ref-type="bibr" rid="B7">Fa et al., 2021</xref>; <xref ref-type="bibr" rid="B32">Wang et al., 2021</xref>), such as CCL5, TUBB1, MYL9, HIST1H2AC, etc. (<xref ref-type="fig" rid="F4">Figure 4A</xref>), were associated with early platelet production. Another subpopulation, MK5, with high CD74 and PLAC8 expression, might be a less mature MK population. 
Moreover, we observed that many DE genes between MK3 and MK5 cells overlap with DE genes between tumor and normal epitheliums and DE genes between tumor and normal fibroblasts (<xref ref-type="fig" rid="F4">Figure 4B</xref>
<bold>,</bold> <xref ref-type="fig" rid="F4">Figure 4C</xref>). They had similar expression patterns in the MK5 cells, tumor epitheliums, and fibroblasts (<xref ref-type="fig" rid="F4">Figure 4D</xref>) and were related to colorectal cancer progression or metastasis. The violin plot showed similar distribution shapes of CTTN in MK5 cells and epithelial tumor cells. The gene CTTN has been reported to be overexpressed in various cancers, including colorectal cancer, and to promote tumor cell migration (<xref ref-type="bibr" rid="B22">Luo et al., 2006</xref>; <xref ref-type="bibr" rid="B11">Jing et al., 2016</xref>; <xref ref-type="bibr" rid="B38">Zhang et al., 2017</xref>). Furthermore, S100A4 (<xref ref-type="bibr" rid="B9">Helfman et al., 2005</xref>; <xref ref-type="bibr" rid="B24">Nader et al., 2020</xref>), S100A6 (<xref ref-type="bibr" rid="B15">Komatsu et al., 2000</xref>), UBA52 (<xref ref-type="bibr" rid="B41">Zhou et al., 2019</xref>), FAU (<xref ref-type="bibr" rid="B25">Pickard et al., 2011</xref>), and VIM (<xref ref-type="bibr" rid="B23">Luque-Garcia et al., 2010</xref>; <xref ref-type="bibr" rid="B36">Xu et al., 2017</xref>), etc., also had similar expression patterns between MK5 cells and tumor epitheliums and fibroblasts. S100A4 and S100A6 play an important role in tumor metastases, including colorectal tumor metastasis (<xref ref-type="bibr" rid="B15">Komatsu et al., 2000</xref>). Recent studies have shown that the bloodstream plays a crucial role in tumor metastasis and tumor immune escape (<xref ref-type="bibr" rid="B16">Lawson et al., 2018</xref>). The cooperation of hematopoiesis, megakaryocytes, and platelet production aids circulating tumor cells (CTCs) in escaping the immune system and disseminating within the bloodstream to establish distant organ metastasis. 
We also validated the expression pattern of these genes in the spatial transcriptome data of two other stage IV colorectal cancer patients (<xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>), which showed spatial patterns of high expression in cancer cells.</p>
<p>Consequently, we supposed that a series of genes, CTTN, S100A4, S100A6, etc., were potential colorectal cancer metastasis biomarkers. The MK5 subpopulation with highly-expressed above potential biomarkers might be a cluster related to colorectal cancer metastasis and have a circulating tumor cell (CTC). The exact mechanism between MK5 and colorectal tumor metastasis warranted further investigation.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Discussion</title>
<p>Differential expression analysis is a crucial topic in cancer heterogeneity analysis. The new characteristics of scRNA-seq data put forward new challenges for DE methods. The performances of model-based methods are unstable due to strong assumptions, and these methods lack scalability in the face of the explosively growing scale of single-cell data. Test-based methods are more scalable than model-based methods. However, the accuracy of existing test-based methods is relatively low in identifying DE genes due to the sparsity, variability, and complexity of scRNA-seq data. HEART proposes a bio-driven combination test framework that captures comprehensive differences by integrating differential information about gene expression ratio, gene expression level, and variability. Unlike most competitors, which assume a theoretical statistical distribution (some are complex mixture distributions) for gene expression, HEART uses a combination framework of simple statistical tests to test the two parts of the gene expression. We compared HEART and the other six DE methods on various simulation datasets with different sample sizes and DE strengths of DE genes. HEART achieved an excellent trade-off between accuracy and computational efficiency. It had higher <inline-formula id="inf46">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores than all classical test methods and most model-based methods and can readily be extended to ultra-large-scale datasets. Moreover, HEART had robust performance on datasets with different statistical characteristics, while DESeq2 and Monocle3 had unstable performance on diverse datasets. Although NBID performed better than HEART in some scenarios, its computational cost on large-scale datasets may not be worth the increased accuracy it provided (a dataset with 20000 cells and 10000 genes: NBID: <inline-formula id="inf47">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores &#x3d; 0.871, running time &#x3d; 6482&#xa0;s; HEART: <inline-formula id="inf48">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores &#x3d; 0.84, running time &#x3d; 52&#xa0;s). To demonstrate the accuracy, robustness, and generality of HEART, we compared HEART and the other six DE methods on two single-cell datasets from different quantification mechanisms. HEART had high accuracy and high specificity on data from both quantification schemes. We applied HEART and the other six methods to identify DE genes between CD4<sup>&#x2b;</sup> Na&#xef;ve T cells and CD4<sup>&#x2b;</sup> memory T cells from the PBMC68K dataset quantified by UMI counts. HEART had less computational time and higher TPRs and <inline-formula id="inf49">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores than other methods under different standards. Moreover, HEART had a good ability to capture DE genes with low expression count levels, which are easily missed by most DE analysis methods. HEART identified the genes FAS and TNF, verified DE genes in the literature, which have low gene expression ratios and expression counts in this PBMC68K dataset. On the human brain single-cell dataset quantified by read count, HEART had the highest accuracy and controlled false-positive rates well. It achieved a good balance between sensitivity and specificity. In addition, applying HEART to two subpopulations of megakaryocytes, we found several potential cancer biomarkers (CTTN, S100A4, S100A6, UBA52, FAU, and VIM, etc.) associated with colorectal cancer progression and metastasis in the literature. HEART also detected these DE genes between normal and tumor epitheliums and fibroblasts. We observed that these genes showed spatial patterns of high expression in cancer cells in the spatial transcriptome data of two other stage IV colorectal cancer patients. Megakaryocytes are the source of platelets. Whereas the contribution of platelets to cancer progression and metastasis has been extensively characterized (<xref ref-type="bibr" rid="B3">Cho et al., 2012</xref>), the interaction of tumor cells with platelets and megakaryocytes during the metastatic cascade is less well-defined. Currently, the role of megakaryocytes during metastasis is starting to be appreciated. Some studies have demonstrated an increased number of megakaryocytes in patients with cancer metastases (<xref ref-type="bibr" rid="B10">Huang et al., 2019</xref>; <xref ref-type="bibr" rid="B21">Lucotti and Muschel, 2020</xref>). In recent years, studies of the platelet and megakaryocyte transcriptome at the single-cell level have indicated that megakaryocytes and platelets are much more diverse than previously thought. 
They fulfill their distinct functions through heterogeneous subpopulations (<xref ref-type="bibr" rid="B14">Kharchenko, 2021</xref>; <xref ref-type="bibr" rid="B18">Liu et al., 2021</xref>). In keeping with these studies, we found that an MK subpopulation correlated with colorectal cancer metastasis. Furthermore, the proven colorectal cancer biomarkers had similar gene expression patterns in MK5 subpopulation cells and tumor epitheliums. The correlation between the MK5 subpopulation and colorectal cancer metastasis may be closer than previous studies suggested. Of course, the comprehensive link and the underlying molecular basis between MKs, platelets, and tumor cells require further experiments and research to clarify. HEART has two main limitations. First, like other DE methods, it is sensitive to sample size and performs poorly on small datasets (<inline-formula id="inf50">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>60</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Second, HEART is only designed for comparisons between two groups, and expansion to comparisons between multiple groups requires more research.</p>
<p>In summary, HEART is a competitive DE method for scRNA-seq data, which maintains high accuracy, unrivaled computational efficiency, and strong robustness across diverse scRNA-seq datasets.</p>
</sec>
<sec id="s4">
<title>4 Materials and methods</title>
<sec id="s4-1">
<title>4.1 Datasets</title>
<p>We used three real scRNA-seq datasets in the applications. The PBMC68K dataset is available from <ext-link ext-link-type="uri" xlink:href="https://support.10xgenomics.com/single-cell-gene-expression/datasets">https://support.10xgenomics.com/single-cell-gene-expression/datasets</ext-link>. The human brain dataset can be obtained from the R package SC2P or from the GEO database repository under accession code GSE67835. The scRNA-seq data of one stage III colorectal cancer patient have been deposited in OMIX, China National Center for Bioinformation / Beijing Institute of Genomics, Chinese Academy of Sciences (<ext-link ext-link-type="uri" xlink:href="https://ngdc.cncb.ac.cn/search/?dbId=&amp;q=PRJCA012584">https://ngdc.cncb.ac.cn/omix: accession no. OMIX002120</ext-link>). The spatial transcriptomic data of two colorectal cancer patients are available from <ext-link ext-link-type="uri" xlink:href="http://www.cancerdiversity.asia/scCRLM">http://www.cancerdiversity.asia/scCRLM</ext-link>.</p>
</sec>
<sec id="s4-2">
<title>4.2 Simulation settings</title>
<p>We used two simulation data generation mechanisms to generate scenarios with different settings. Each design had 20 replications. In simulation 1, the simulation datasets were generated with the popular simulation protocol Splatter (<xref ref-type="bibr" rid="B37">Zappia et al., 2017</xref>). Each scenario contained 10,000 genes (1,000 DE genes and 9,000 non-DE genes) and two underlying subpopulations. We varied the number of samples (1,000, 2,000, 5,000, 10,000, and 20,000) and the DE strength of the DE genes (de.factor &#x3d; 0.3, 0.5). The de.factor parameter is the differential expression factor produced from a log-normal distribution. A higher de.factor results in stronger differential expression of DE genes between groups (more details of parameters in <xref ref-type="sec" rid="s11">Supplementary Material S1</xref>).</p>
<p>Simulation 2 adopted a semi-simulation mechanism based on actual scRNA datasets to recover the multimodality and biological characteristic complexity of actual scRNA-seq data (<xref ref-type="fig" rid="F2">Figure 2C</xref>, <xref ref-type="sec" rid="s11">Supplementary Material S1</xref>) (<xref ref-type="bibr" rid="B2">Chen et al., 2018</xref>). First, we randomly divided the real scRNA-seq dataset into two parts regarded as two groups of cells. The second step was to create differentially expressed genes. We ranked the mean counts of all genes of the second group of cells and chose 200 genes, starting with the one having a mean count just above <inline-formula id="inf51">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We selected another 200 genes beginning with the mean count just above <inline-formula id="inf52">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Then, we swapped the gene expression of these two equal-sized sets of selected genes in the second group of cells and obtained a simulation dataset with two cell groups and a known list of DE genes. The parameter FC controlled the DE strength of DE genes between groups. We considered three DE strengths of DE genes: weak (FC &#x3d; 1.5), moderate (FC &#x3d; 2), and strong (FC &#x3d; 2.5). In simulation 2, we chose PBMC68K (<xref ref-type="bibr" rid="B40">Zheng et al., 2017</xref>) as source data. PBMC68K consists of transcription profiles of &#x223c;68,000 peripheral blood mononuclear cells and has 11 different cell subtypes with sample sizes ranging from &#x223c;90 to &#x223c;20,000 (more details in <xref ref-type="sec" rid="s11">Supplementary Material S1</xref>). We generated three simulation scenarios for each subtype of cells with three different levels (weak, moderate, and strong) of difference to test the sensitivity of detecting the DE genes.</p>
</sec>
<sec id="s4-3">
<title>4.3 DE genes list</title>
<p>All DE gene lists in simulation datasets were artificially set. We calculated all method performance indices according to known DE gene lists. Due to the unattainability of the whole accurate DE genes list of different cell groups in real single-cell data, we used different standards to set three potential DE gene lists and calculated all method performance indices.</p>
<p>Standard 1. Known DE genes from the literature.</p>
<p>Standard 2. The top 500 genes ranked by the number of times they were chosen across all methods.</p>
<p>Standard 3. The top 1,000 genes ranked by the number of times they were chosen across all methods.</p>
<p>For Standard 1, we collected dozens of known DE genes from various experiments based on bulk RNA-seq in the literature (<xref ref-type="bibr" rid="B19">Liu et al., 2001</xref>; <xref ref-type="bibr" rid="B34">Weng et al., 2012</xref>; <xref ref-type="bibr" rid="B39">Zhang et al., 2014</xref>; <xref ref-type="bibr" rid="B4">Darmanis et al., 2015</xref>). These genes are only a subset of the true DE genes between different cell clusters. For Standards 2 and 3, we ranked all genes by the number of times they were chosen across all methods and set the top 500 and top 1,000 genes, respectively, as potential DE genes between different cell clusters.</p>
</sec>
<sec id="s4-4">
<title>4.4 Index</title>
<p>On the basis of the DE gene lists described in Section 4.3, we calculated a series of indices: <inline-formula id="inf53">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores, true positive rate (TPR, recall), false discovery rate (FDR), and time consumption to assess the performance of all methods. All indices were presented as the average value of 20 replications.</p>
</sec>
<sec id="s4-5">
<title>4.5 Method details</title>
<p>For hypothesis <inline-formula id="inf54">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>01</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we compared the positive expression ratio of the gene <inline-formula id="inf55">
<mml:math id="m58">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the two groups of cells. The total numbers of two groups of cells are <inline-formula id="inf56">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf57">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. And the numbers of positive expressions of the gene <inline-formula id="inf58">
<mml:math id="m61">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the two groups of cells are <inline-formula id="inf59">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
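<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:msubsup>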
<mml:mi mathvariant="bold">I</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf60">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
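<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:msubsup>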
<mml:mi mathvariant="bold">I</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. <inline-formula id="inf61">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the UMI count of the gene <inline-formula id="inf62">
<mml:math id="m65">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of cell <inline-formula id="inf63">
<mml:math id="m66">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the group <inline-formula id="inf64">
<mml:math id="m67">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf65">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the gene <inline-formula id="inf66">
<mml:math id="m69">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2019;s positive expression proportion in the group <inline-formula id="inf67">
<mml:math id="m70">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf68">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the estimator of <inline-formula id="inf69">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Hence, the positive expression ratios of the gene <inline-formula id="inf70">
<mml:math id="m73">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in <italic>group 1</italic> and <italic>group 2</italic> are <inline-formula id="inf71">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf72">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, respectively.<disp-formula id="equ4">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>01</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ5">
<mml:math id="m77">
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x2217;</mml:mo>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x2217;</mml:mo>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ6">
<mml:math id="m78">
<mml:mrow>
<mml:mtext>where&#xa0;</mml:mtext>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x2217;</mml:mo>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ7">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>Z</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>01</mml:mn>
</mml:msub>
<mml:mtext>&#xa0;is&#xa0;true</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>In terms of hypotheses <inline-formula id="inf73">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>02</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf74">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>03</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, only the &#x201c;<bold>On</bold>&#x201d; state of each gene is involved in calculations. For hypothesis <inline-formula id="inf75">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>02</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we used the Student&#x2019;s t-test to determine whether the two groups differ significantly on the central location of gene <inline-formula id="inf76">
<mml:math id="m83">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2019;s expression of the &#x201c;<bold>On</bold>&#x201d; state.<disp-formula id="equ8">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>02</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mtext> </mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ9">
<mml:math id="m85">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ10">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mo>&#x007C;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x007C;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>02</mml:mn>
</mml:msub>
<mml:mtext>&#xa0;is&#xa0;true</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>
<inline-formula id="inf77">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the mean of the gene <inline-formula id="inf78">
<mml:math id="m88">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the group <inline-formula id="inf79">
<mml:math id="m89">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> on the positive part (&#x201c;on&#x201d; state). <inline-formula id="inf80">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> is the estimator of the <inline-formula id="inf81">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf82">
<mml:math id="m92">
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:math>
</inline-formula> is the estimator of the <inline-formula id="inf83">
<mml:math id="m93">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, which is the variance of the gene <inline-formula id="inf84">
<mml:math id="m94">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the group <inline-formula id="inf85">
<mml:math id="m95">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2019;s &#x201c;on&#x201d; part, where <inline-formula id="inf86">
<mml:math id="m96">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.
</p>
<p>For hypothesis <inline-formula id="inf87">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>03</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we used the Brown&#x2013;Forsythe test to test the equality of variance of gene <inline-formula id="inf88">
<mml:math id="m98">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2019;s positive expression.<disp-formula id="equ11">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>03</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>;</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2260;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ12">
<mml:math id="m100">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>G</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>G</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>G</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2002;</mml:mtext>
<mml:mrow>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>G</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>x</mml:mi>
<mml:mo>&#x223c;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ13">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn>03</mml:mn>
</mml:msub>
<mml:mtext>&#xa0;is&#xa0;true</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf89">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>x</mml:mi>
<mml:mo>&#x223c;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <inline-formula id="inf90">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>x</mml:mi>
<mml:mo>&#x223c;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the median of the g-th subgroup. Then we performed statistical tests on each null hypothesis <inline-formula id="inf91">
<mml:math id="m104">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. The <italic>p</italic>-value of each test is recorded as <inline-formula id="inf92">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We obtained a new statistic <inline-formula id="inf93">
<mml:math id="m106">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> by combining three individual <italic>p</italic>-values <inline-formula id="inf94">
<mml:math id="m107">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the statistics for each null hypothesis <inline-formula id="inf95">
<mml:math id="m108">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.<disp-formula id="equ14">
<mml:math id="m109">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:munderover>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>
<inline-formula id="inf96">
<mml:math id="m110">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> follows a <inline-formula id="inf97">
<mml:math id="m111">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c7;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> distribution. If <inline-formula id="inf98">
<mml:math id="m112">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are independent, <inline-formula id="inf99">
<mml:math id="m113">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msup>
<mml:mi>&#x3c7;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The number of degrees of freedom of <inline-formula id="inf100">
<mml:math id="m114">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is not equal to 6 in most scenarios because of the correlation among the <inline-formula id="inf101">
<mml:math id="m115">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To solve this problem, we estimated the degrees of freedom that best match the real data distribution by <inline-formula id="inf102">
<mml:math id="m116">
<mml:mrow>
<mml:munder>
<mml:mi mathvariant="italic">sup</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>f</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> (more details in <xref ref-type="sec" rid="s11">Supplementary Material S1</xref>).</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/<xref ref-type="sec" rid="s11">Supplementary Material</xref>.</p>
</sec>
<sec id="s6">
<title>Ethics statement</title>
<p>The studies involving human participants were reviewed and approved by the Ethics Committee of Ruijin Hospital, affiliated with Shanghai Jiao Tong University School of Medicine. The patients/participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>XY, SM, and ZY designed HEART. ZY, JS, GC, JZ, and YZ obtained funding and provided the essential materials. XY, BF, TW, YM, and WL preprocessed the datasets. XY implemented the methods, the simulation framework, and the method comparison. All authors read and approved the final paper.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>This study was supported by the National Natural Science Foundation of China (ID: 12171318), the Shanghai Science and Technology Commission (ID: 20JC1410100), the Shanghai Science and Technology Commission (ID: 21ZR1436300), the Three-year plan of Shanghai public health system construction (ID: GWV-10.1-XK05), the Shanghai Jiao Tong University STAR Grant (ID: 20190102), and the Medical Engineering Cross Fund of Shanghai Jiao Tong University (ID: YG2021QN50).</p>
</sec>
<ack>
<p>We gratefully thank PhD students Shuya Cui, Kaiqi Zhang, and Congwen Xiao for their helpful discussions.</p>
</ack>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2022.1063130/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2022.1063130/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application" mime-subtype="pdf"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Biller</surname>
<given-names>L. H.</given-names>
</name>
<name>
<surname>Schrag</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Diagnosis and treatment of metastatic colorectal cancer: A review</article-title>. <source>JAMA</source> <volume>325</volume>, <fpage>669</fpage>&#x2013;<lpage>685</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2021.0106</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Easton</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Finkelstein</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>UMI-count modeling and differential expression analysis for single-cell RNA sequencing</article-title>. <source>Genome Biol.</source> <volume>19</volume>, <fpage>70</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-018-1438-9</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cho</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Bottsford-Miller</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Vasquez</surname>
<given-names>H. G.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zand</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kroll</surname>
<given-names>M. H.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>Platelets increase the proliferation of ovarian cancer cells</article-title>. <source>Blood</source> <volume>120</volume>, <fpage>4869</fpage>&#x2013;<lpage>4872</lpage>. <pub-id pub-id-type="doi">10.1182/blood-2012-06-438598</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Darmanis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sloan</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Enge</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Caneda</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shuer</surname>
<given-names>L. M.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>A survey of human brain transcriptome diversity at the single cell level</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>112</volume>, <fpage>7285</fpage>&#x2013;<lpage>7290</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1507125112</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Adiconis</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Simmons</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Kowalczyk</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Hession</surname>
<given-names>C. C.</given-names>
</name>
<name>
<surname>Marjanovic</surname>
<given-names>N. D.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Systematic comparison of single-cell and single-nucleus RNA-sequencing methods</article-title>. <source>Nat. Biotechnol.</source> <volume>38</volume>, <fpage>737</fpage>&#x2013;<lpage>746</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-020-0465-8</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eide</surname>
<given-names>P. W.</given-names>
</name>
<name>
<surname>Moosavi</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Eilertsen</surname>
<given-names>I. A.</given-names>
</name>
<name>
<surname>Brunsell</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Langerud</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Berg</surname>
<given-names>K. C. G.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Metastatic heterogeneity of the consensus molecular subtypes of colorectal cancer</article-title>. <source>NPJ Genom. Med.</source> <volume>6</volume>, <fpage>59</fpage>. <pub-id pub-id-type="doi">10.1038/s41525-021-00223-7</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fa</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Johnston</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>GapClust is a light-weight approach distinguishing rare cells from voluminous single cell expression profiles</article-title>. <source>Nat. Commun.</source> <volume>12</volume>, <fpage>4197</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-24489-8</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Finak</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>McDavid</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yajima</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gersuk</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Shalek</surname>
<given-names>A. K.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>MAST: A flexible statistical framework for assessing transcriptional changes and characterizing heterogeneity in single-cell RNA sequencing data</article-title>. <source>Genome Biol.</source> <volume>16</volume>, <fpage>278</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-015-0844-5</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Helfman</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>E. J.</given-names>
</name>
<name>
<surname>Lukanidin</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Grigorian</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>The metastasis associated protein S100A4: Role in tumour progression and metastasis</article-title>. <source>Br. J. Cancer</source> <volume>92</volume>, <fpage>1955</fpage>&#x2013;<lpage>1958</lpage>. <pub-id pub-id-type="doi">10.1038/sj.bjc.6602613</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Presence of intra-tumoral CD61&#x2b; megakaryocytes predicts poor prognosis in non-small cell lung cancer</article-title>. <source>Transl. Lung Cancer Res.</source> <volume>8</volume>, <fpage>323</fpage>&#x2013;<lpage>331</lpage>. <pub-id pub-id-type="doi">10.21037/tlcr.2019.08.23</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jing</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Cortactin promotes cell migration and invasion through upregulation of the dedicator of cytokinesis 1 expression in human colorectal cancer</article-title>. <source>Oncol. Rep.</source> <volume>36</volume>, <fpage>1946</fpage>&#x2013;<lpage>1952</lpage>. <pub-id pub-id-type="doi">10.3892/or.2016.5058</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kashima</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sakamoto</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kaneko</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Seki</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Suzuki</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Suzuki</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Single-cell sequencing techniques from individual to multiomics analyses</article-title>. <source>Exp. Mol. Med.</source> <volume>52</volume>, <fpage>1419</fpage>&#x2013;<lpage>1427</lpage>. <pub-id pub-id-type="doi">10.1038/s12276-020-00499-2</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kharchenko</surname>
<given-names>P. V.</given-names>
</name>
<name>
<surname>Silberstein</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Scadden</surname>
<given-names>D. T.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Bayesian approach to single-cell differential expression analysis</article-title>. <source>Nat. Methods</source> <volume>11</volume>, <fpage>740</fpage>&#x2013;<lpage>742</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.2967</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kharchenko</surname>
<given-names>P. V.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The triumphs and limitations of computational methods for scRNA-seq</article-title>. <source>Nat. Methods</source> <volume>18</volume>, <fpage>723</fpage>&#x2013;<lpage>732</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-021-01171-x</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Komatsu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kobune-Fujiwara</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Andoh</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ishiguro</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hunai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Suzuki</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2000</year>). <article-title>Increased expression of S100A6 at the invading fronts of the primary lesion and liver metastasis in patients with colorectal adenocarcinoma</article-title>. <source>Br. J. Cancer</source> <volume>83</volume>, <fpage>769</fpage>&#x2013;<lpage>774</lpage>. <pub-id pub-id-type="doi">10.1054/bjoc.2000.1356</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lawson</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Kessenbrock</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>R. T.</given-names>
</name>
<name>
<surname>Pervolarakis</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Werb</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Tumour heterogeneity and metastasis at single-cell resolution</article-title>. <source>Nat. Cell Biol.</source> <volume>20</volume>, <fpage>1349</fpage>&#x2013;<lpage>1360</lpage>. <pub-id pub-id-type="doi">10.1038/s41556-018-0236-7</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Linnekamp</surname>
<given-names>J. F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Medema</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Vermeulen</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Colorectal cancer heterogeneity and targeted therapy: A case for molecular disease subtypes</article-title>. <source>Cancer Res.</source> <volume>75</volume>, <fpage>245</fpage>&#x2013;<lpage>249</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.CAN-14-2240</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The heterogeneity of megakaryocytes and platelets and implications for <italic>ex vivo</italic> platelet generation</article-title>. <source>Stem Cells Transl. Med.</source> <volume>10</volume>, <fpage>1614</fpage>&#x2013;<lpage>1620</lpage>. <pub-id pub-id-type="doi">10.1002/sctm.21-0264</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Prabhu</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Young</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Becker</surname>
<given-names>K. G.</given-names>
</name>
<name>
<surname>Munson</surname>
<given-names>P. J.</given-names>
</name>
<etal/>
</person-group> (<year>2001</year>). <article-title>Augmentation in expression of activation-induced genes differentiates memory from naive CD4&#x2b; T cells and is a molecular mechanism for enhanced cellular response of memory CD4&#x2b; T cells</article-title>. <source>J. Immunol.</source> <volume>166</volume>, <fpage>7335</fpage>&#x2013;<lpage>7344</lpage>. <pub-id pub-id-type="doi">10.4049/jimmunol.166.12.7335</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Love</surname>
<given-names>M. I.</given-names>
</name>
<name>
<surname>Huber</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Anders</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2</article-title>. <source>Genome Biol.</source> <volume>15</volume>, <fpage>550</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-014-0550-8</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lucotti</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Muschel</surname>
<given-names>R. J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Platelets and metastasis: New implications of an old interplay</article-title>. <source>Front. Oncol.</source> <volume>10</volume>, <fpage>1350</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2020.01350</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X. M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2006</year>). <article-title>Amplification and overexpression of CTTN (EMS1) contribute to the metastasis of esophageal squamous cell carcinoma by promoting cell migration and anoikis resistance</article-title>. <source>Cancer Res.</source> <volume>66</volume>, <fpage>11690</fpage>&#x2013;<lpage>11699</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.CAN-06-1484</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luque-Garcia</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Martinez-Torrecuadrada</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Epifano</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Canamero</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Babel</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Casal</surname>
<given-names>J. I.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Differential protein expression on the cell surface of colorectal cancer cells associated to tumor metastasis</article-title>. <source>Proteomics</source> <volume>10</volume>, <fpage>940</fpage>&#x2013;<lpage>952</lpage>. <pub-id pub-id-type="doi">10.1002/pmic.200900441</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nader</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Guillon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Petit</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Boissard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Franconi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Blandin</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>S100A4 is a biomarker of tumorigenesis, EMT, invasion, and colonization of host organs in experimental malignant mesothelioma</article-title>. <source>Cancers (Basel)</source> <volume>12</volume>, <fpage>939</fpage>. <pub-id pub-id-type="doi">10.3390/cancers12040939</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pickard</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Mourtada-Maarabouni</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>G. T.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Candidate tumour suppressor Fau regulates apoptosis in human cells: An essential role for Bcl-G</article-title>. <source>Biochim. Biophys. Acta</source> <volume>1812</volume>, <fpage>1146</fpage>&#x2013;<lpage>1153</lpage>. <pub-id pub-id-type="doi">10.1016/j.bbadis.2011.04.009</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hill</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y. A.</given-names>
</name>
<name>
<surname>Trapnell</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Single-cell mRNA quantification and differential analysis with Census</article-title>. <source>Nat. Methods</source> <volume>14</volume>, <fpage>309</fpage>&#x2013;<lpage>315</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.4150</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sarkar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Stephens</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Separating measurement and expression models clarifies confusion in single-cell RNA sequencing analysis</article-title>. <source>Nat. Genet.</source> <volume>53</volume>, <fpage>770</fpage>&#x2013;<lpage>777</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-021-00873-4</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Satija</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Farrell</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Gennert</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Schier</surname>
<given-names>A. F.</given-names>
</name>
<name>
<surname>Regev</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Spatial reconstruction of single-cell gene expression data</article-title>. <source>Nat. Biotechnol.</source> <volume>33</volume>, <fpage>495</fpage>&#x2013;<lpage>502</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.3192</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Soneson</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>M. D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Bias, robustness and scalability in single-cell differential expression analysis</article-title>. <source>Nat. Methods</source> <volume>15</volume>, <fpage>255</fpage>&#x2013;<lpage>261</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.4612</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stuart</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Butler</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hoffman</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hafemeister</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Papalexi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Mauck</surname>
<given-names>W. M.</given-names>
<suffix>3rd</suffix>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Comprehensive integration of single-cell data</article-title>. <source>Cell</source> <volume>177</volume>, <fpage>1888</fpage>&#x2013;<lpage>1902</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2019.05.031</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trapnell</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cacchiarelli</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Grimsby</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pokharel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Morse</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>The dynamics and regulators of cell fate decisions are revealed by pseudotemporal ordering of single cells</article-title>. <source>Nat. Biotechnol.</source> <volume>32</volume>, <fpage>381</fpage>&#x2013;<lpage>386</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.2859</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Decoding human megakaryocyte development</article-title>. <source>Cell Stem Cell</source> <volume>28</volume>, <fpage>535</fpage>&#x2013;<lpage>549</lpage>. <pub-id pub-id-type="doi">10.1016/j.stem.2020.11.006</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Nelson</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Nabavi</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Comparative analysis of differential gene expression analysis tools for single-cell RNA sequencing data</article-title>. <source>BMC Bioinforma.</source> <volume>20</volume>, <fpage>40</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-019-2599-6</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weng</surname>
<given-names>N. P.</given-names>
</name>
<name>
<surname>Araki</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Subedi</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>The molecular basis of the memory T cell response: Differential gene expression and its epigenetic regulation</article-title>. <source>Nat. Rev. Immunol.</source> <volume>12</volume>, <fpage>306</fpage>&#x2013;<lpage>315</lpage>. <pub-id pub-id-type="doi">10.1038/nri3173</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Stitzel</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Two-phase differential expression analysis for single cell RNA-seq</article-title>. <source>Bioinformatics</source> <volume>34</volume>, <fpage>3340</fpage>&#x2013;<lpage>3348</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty329</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Chan</surname>
<given-names>P. Y.</given-names>
</name>
<name>
<surname>Shaw</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hines</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>The novel association of circulating tumor cells and circulating megakaryocytes with prostate cancer prognosis</article-title>. <source>Clin. Cancer Res.</source> <volume>23</volume>, <fpage>5112</fpage>&#x2013;<lpage>5122</lpage>. <pub-id pub-id-type="doi">10.1158/1078-0432.CCR-16-3081</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zappia</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Phipson</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Oshlack</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Splatter: Simulation of single-cell RNA sequencing data</article-title>. <source>Genome Biol.</source> <volume>18</volume>, <fpage>174</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-017-1305-0</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Jing</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Cortactin promotes colorectal cancer cell proliferation by activating the EGFR-MAPK pathway</article-title>. <source>Oncotarget</source> <volume>8</volume>, <fpage>1541</fpage>&#x2013;<lpage>1554</lpage>. <pub-id pub-id-type="doi">10.18632/oncotarget.13652</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Sloan</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Bennett</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Scholze</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>O&#x27;Keeffe</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>An RNA-sequencing transcriptome and splicing database of glia, neurons, and vascular cells of the cerebral cortex</article-title>. <source>J. Neurosci.</source> <volume>34</volume>, <fpage>11929</fpage>&#x2013;<lpage>11947</lpage>. <pub-id pub-id-type="doi">10.1523/JNEUROSCI.1860-14.2014</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname>
<given-names>G. X. Y.</given-names>
</name>
<name>
<surname>Terry</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Belgrader</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ryvkin</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bent</surname>
<given-names>Z. W.</given-names>
</name>
<name>
<surname>Wilson</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Massively parallel digital transcriptional profiling of single cells</article-title>. <source>Nat. Commun.</source> <volume>8</volume>, <fpage>14049</fpage>. <pub-id pub-id-type="doi">10.1038/ncomms14049</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>LUCAT1 promotes colorectal cancer tumorigenesis by targeting the ribosomal protein L40-MDM2-p53 pathway through binding with UBA52</article-title>. <source>Cancer Sci.</source> <volume>110</volume>, <fpage>1194</fpage>&#x2013;<lpage>1207</lpage>. <pub-id pub-id-type="doi">10.1111/cas.13951</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zilionis</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Nainys</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Veres</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Savova</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Zemmour</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>A. M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Single-cell barcoding and sequencing using droplet microfluidics</article-title>. <source>Nat. Protoc.</source> <volume>12</volume>, <fpage>44</fpage>&#x2013;<lpage>73</lpage>. <pub-id pub-id-type="doi">10.1038/nprot.2016.154</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zimmerman</surname>
<given-names>D. W.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>Statistical significance levels of nonparametric tests biased by heterogeneous variances of treatment groups</article-title>. <source>J. Gen. Psychol.</source> <volume>127</volume>, <fpage>354</fpage>&#x2013;<lpage>364</lpage>. <pub-id pub-id-type="doi">10.1080/00221300009598589</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>