<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1380928</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2024.1380928</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Bioinformatics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>bootGSEA: a bootstrap and rank aggregation pipeline for multi-study and multi-omics enrichment analyses</article-title>
<alt-title alt-title-type="left-running-head">Hemandhar Kumar et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2024.1380928">10.3389/fbinf.2024.1380928</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Hemandhar Kumar</surname>
<given-names>Shamini</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2649521/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Tapken</surname>
<given-names>Ines</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2639353/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kuhn</surname>
<given-names>Daniela</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2660675/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Claus</surname>
<given-names>Peter</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/133461/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Jung</surname>
<given-names>Klaus</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/795233/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Institute for Animal Genomics</institution>, <institution>University of Veterinary Medicine</institution>, <institution>Foundation</institution>, <addr-line>Hannover</addr-line>, <country>Germany</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Center for Systems Neuroscience (ZSN)</institution>, <institution>University of Veterinary Medicine</institution>, <institution>Foundation</institution>, <addr-line>Hannover</addr-line>, <country>Germany</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>SMATHERIA gGmbH&#x2014;Non-Profit Biomedical Research Institute</institution>, <addr-line>Hannover</addr-line>, <country>Germany</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Clinic for Conservative Dentistry, Periodontology and Preventive Dentistry</institution>, <institution>Hannover Medical School</institution>, <addr-line>Hannover</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/74339/overview">Dapeng Wang</ext-link>, Imperial College London, United Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/116259/overview">Federica Chiappori</ext-link>, National Research Council (CNR), Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/596453/overview">Alessandro Palma</ext-link>, Sapienza University of Rome, Italy</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Klaus Jung, <email>klaus.jung@tiho-hannover.de</email>
</corresp>
<fn fn-type="equal" id="fn001">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors share first authorship</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>04</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>4</volume>
<elocation-id>1380928</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>02</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>18</day>
<month>03</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Hemandhar Kumar, Tapken, Kuhn, Claus and Jung.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Hemandhar Kumar, Tapken, Kuhn, Claus and Jung</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Introduction:</bold> Gene set enrichment analysis (GSEA) subsequent to differential expression analysis is a standard step in transcriptomics and proteomics data analysis. Although many tools for this step are available, the results are often difficult to reproduce because set annotations can change in the databases, that is, new features can be added or existing features can be removed. Finally, such changes in set compositions can have an impact on biological interpretation.</p>
<p>
<bold>Methods:</bold> We present bootGSEA, a novel computational pipeline, to study the robustness of GSEA. By repeating GSEA based on bootstrap samples, the variability and robustness of results can be studied. In our pipeline, not all genes or proteins are involved in the different bootstrap replicates of the analyses. Finally, we aggregate the ranks from the bootstrap replicates to obtain a score per gene set that shows whether it gains or loses evidence compared to the ranking of the standard GSEA. Rank aggregation is also used to combine GSEA results from different omics levels or from multiple independent studies at the same omics level.</p>
<p>
<bold>Results:</bold> By applying our approach to six independent cancer transcriptomics datasets, we showed that bootstrap GSEA can aid in the selection of more robust enriched gene sets. Additionally, we applied our approach to paired transcriptomics and proteomics data obtained from a mouse model of spinal muscular atrophy (SMA), a neurodegenerative and neurodevelopmental disease associated with multi-system involvement. After obtaining a robust ranking at both omics levels, both ranking lists were combined to aggregate the findings from the transcriptomics and proteomics results. Furthermore, we constructed the new R-package &#x201c;bootGSEA,&#x201d; which implements the proposed methods and provides graphical views of the findings. Bootstrap-based GSEA was able in the example datasets to identify gene or protein sets that were less robust when the set composition changed during bootstrap analysis.</p>
<p>
<bold>Discussion:</bold> The rank aggregation step was useful for combining bootstrap results and making them comparable to the original findings on the single-omics level or for combining findings from multiple different omics levels.</p>
</abstract>
<kwd-group>
<kwd>bootstrap analysis</kwd>
<kwd>gene set enrichment analysis</kwd>
<kwd>multi-omics analysis</kwd>
<kwd>proteomics</kwd>
<kwd>rank aggregation</kwd>
<kwd>transcriptomics</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Integrative Bioinformatics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Set-based enrichment methods are an integral part of the analysis of high-throughput expression data, such as those originating from transcriptomics or proteomics experiments. Enrichment methods allow the identification of molecular pathways, Gene Ontology (GO) terms, and other gene sets that might play a role in the disease of interest. Most enrichment methods are conducted subsequent to differential expression analysis; that is, they rely on the ranking of genes after comparing two groups of samples. Statistical tests are used to determine whether the genes of a particular set are disproportionately highly enriched among the differentially expressed genes (DEGs) (<xref ref-type="bibr" rid="B6">Beissbarth and Speed, 2004</xref>; <xref ref-type="bibr" rid="B49">Subramanian et al., 2005</xref>; <xref ref-type="bibr" rid="B1">Ackermann and Strimmer, 2009</xref>). This contrasts with self-contained gene set tests, which are based on subsets of expression data related to a particular gene set (<xref ref-type="bibr" rid="B19">Goeman et al., 2004</xref>; <xref ref-type="bibr" rid="B26">Hummel et al., 2008</xref>; <xref ref-type="bibr" rid="B28">Jung et al., 2011</xref>; <xref ref-type="bibr" rid="B5">Bayerlov&#xe1; et al., 2015</xref>).</p>
<p>Set information for enrichment analyses is usually obtained from public databases, for example, on molecular pathways or GO terms. The most commonly used databases are the &#x201c;Reactome pathway knowledgebase&#x201d; (<xref ref-type="bibr" rid="B14">Fabregat et al., 2016</xref>) (<ext-link ext-link-type="uri" xlink:href="https://reactome.org/">https://reactome.org/</ext-link>), &#x201c;Kyoto Encyclopedia of Genes and Genomes&#x201d; (<xref ref-type="bibr" rid="B29">Kanehisa and Goto, 2000</xref>) (KEGG; <ext-link ext-link-type="uri" xlink:href="https://www.genome.jp/kegg/">https://www.genome.jp/kegg/</ext-link>), &#x201c;WikiPathways&#x201d; database (<xref ref-type="bibr" rid="B31">Kelder et al., 2012</xref>) (<ext-link ext-link-type="uri" xlink:href="https://www.wikipathways.org">https://www.wikipathways.org</ext-link>), &#x201c;GO&#x201d; knowledgebase (<xref ref-type="bibr" rid="B9">Consortium, 2004</xref>) (<ext-link ext-link-type="uri" xlink:href="http://geneontology.org/">http://geneontology.org/</ext-link>), and the &#x201c;Molecular Signatures Database&#x201d; (<xref ref-type="bibr" rid="B36">Liberzon et al., 2011</xref>) (MSigDB; <ext-link ext-link-type="uri" xlink:href="https://www.gsea-msigdb.org/gsea/msigdb">https://www.gsea-msigdb.org/gsea/msigdb</ext-link>). In the GO database, a particular GO term comprises a set of genes that can be assigned to a biological process (BP), molecular function (MF), or cellular component (CC).</p>
<p>The contents of the databases are curated either automatically by computer algorithms or manually by experts (<xref ref-type="bibr" rid="B9">Consortium, 2004</xref>). Specifically, WikiPathways provides community-based curation by registered contributors (<xref ref-type="bibr" rid="B31">Kelder et al., 2012</xref>). An example of a database where curation is done both ways, manually and computationally, is the MSigDB. Furthermore, pathway membership can be experimentally validated or predicted computationally. However, none of the modes of curation can prevent the uncertainties remaining regarding the membership of individual genes to a particular pathway (<xref ref-type="bibr" rid="B17">Gillis and Pavlidis, 2013</xref>). This is important because all enrichment analyses rely on the correctness of the database information, and the results of enrichment analyses would change if features of a set are removed or added. This can especially happen when the database information is retrieved at different times. For example, the GO database contained 42,442 terms classified as valid and 5,287 classified as obsolete in January 2024. Two months before, only 4,889 terms were classified as obsolete, meaning that nearly 400 terms would have to be reconsidered when a Gene-set enrichment analysis (GSEA) is performed after January 2024. In addition, the WikiPathways database reports roughly between 100 and 700 edits per month. Furthermore, in the KEGG database, complete pathways can be merged, leading to a large number of changes. For example, the KEGG pathway map00471 has been deleted and then added to the KEGG pathway map00470 (&#x201c;D-amino acid metabolism&#x201d;).</p>
<p>In this work, we present bootGSEA as a novel bootstrap approach to repeatedly sample subsets of pathways or other gene sets to study whether a result remains significant when the set composition is changed. The ranking lists of the gene sets obtained from each bootstrap replicate were aggregated using a score that can be used for a new ranking list. The analyst can then compare the original ranking with the bootstrap-based ranking list to study whether the association of a pathway or GO term with the disease gains or loses evidence. A similar approach was proposed by <xref ref-type="bibr" rid="B46">Schmid et al. (2016),</xref> who generated a robustness score for each gene set using random subsets of gene sets. In contrast to their approach, our method results in a new ranking of gene sets that can be helpful in aggregating findings from different independent studies or different omics levels. Thus, our approach for multi-omics follows the idea of aggregating the different omics levels after performing primary analysis on the individual levels first. This way of multi-omics analysis has also been implemented in other studies. For example, <xref ref-type="bibr" rid="B52">Wang et al. (2014)</xref> fused networks that were first derived on individual omics levels, <xref ref-type="bibr" rid="B54">Xiong et al. (2012)</xref> integrated genetic and transcriptomics results in a joint score, and <xref ref-type="bibr" rid="B30">Kang et al. (2022)</xref> fitted neural networks from individual omics levels and merged them into a joint model.</p>
<p>We also demonstrated that this method is useful for aggregating the results of enrichment analysis from different omics domains in the same experiment. We applied our new bootstrap pipeline to a single-omics scenario (transcriptomics only) comprising six independent renal cancer datasets. This example was used to show how our bootstrap pipeline can help study the robustness of GSEA when comparing results from multiple independent datasets from studies on a similar research question. In addition, we analyzed our multi-omics kidney data (transcriptomics and proteomics) from our research consortium on spinal muscular atrophy (SMA). The data were obtained from a SMA mouse model to demonstrate the usefulness of our approach when comparing GSEA results between different omics levels. SMA is a monogenic disease caused by the mutation or deletion of the survival motor neuron 1 (<italic>SMN1</italic>) gene (<xref ref-type="bibr" rid="B35">Lefebvre et al., 1995</xref>). The disease is characterized by the degeneration of motoneurons and the subsequent atrophy of skeletal muscles; because the SMN affects all tissues, non-skeletal muscles are also involved. Moreover, SMA is a multi-system disorder that also affects peripheral organs, such as the kidney (<xref ref-type="bibr" rid="B4">Allardyce et al., 2020</xref>). Three treatment methods are available, all increasing SMN expression. The SMN is expressed ubiquitously and has several important cellular functions, including snRNP assembly, R-loop resolution, and regulation of the actin cytoskeleton and translation (<xref ref-type="bibr" rid="B23">Hensel et al., 2020</xref>). Therefore, SMA is a highly complex disease with expected dysregulations in pathways in several cell types and on several molecular levels.</p>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methods</title>
<p>In this section, we describe the analysis pipeline, including the approach for the bootstrap step used to repeatedly analyze different random subsets of the data. Furthermore, the rank aggregation step and examples of transcriptomics and proteomics data are presented. The complete workflow is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. All the analyses were implemented in the R programming environment [<ext-link ext-link-type="uri" xlink:href="http://www.r-project.org">www.r-project.org</ext-link>, version 4.2 (<xref ref-type="bibr" rid="B43">R Core Team, 2022</xref>)].</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Workflow of the analysis pipeline for bootstrap enrichment analysis at the single-omics level and the steps for single- and multi-omics rank aggregation. For the data obtained from each omics domain (transcriptomics and proteomics data), Gene-set enrichment analysis (GSEA) is performed after differential expression analysis using all data (original standard analysis). Next, gene set enrichment analysis is performed on subsets of data based on bootstrap samples. Finally, the different rankings of the GO terms or pathways can be integrated not only within each omics level but also across all omics levels. The robustness of gene set enrichment analysis can be studied by comparing the original results with either single- or multi-omics results.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g001.tif"/>
</fig>
<sec id="s2-1">
<title>2.1 Differential expression analysis and bootstrap method for Gene-set enrichment analysis</title>
<p>Prior to enrichment analysis, differential expression analysis was performed on normalized expression data from different groups of interest (for example, disease vs. control) to obtain the ranks of genes and proteins. For the microarray and proteomics data, normalization was performed using the quantile method (<xref ref-type="bibr" rid="B7">Bolstad et al., 2003</xref>; <xref ref-type="bibr" rid="B51">V&#xe4;likangas et al., 2018</xref>; <xref ref-type="bibr" rid="B55">Zhao et al., 2020</xref>), and for the RNA-seq data, the internal normalization of the DESeq method of the R package &#x201c;DESeq2&#x201d; was used (<xref ref-type="bibr" rid="B37">Love et al., 2014</xref>). Differential analysis of the proteomics data was performed using the functionality of the R package &#x201c;limma&#x201d; (<xref ref-type="bibr" rid="B45">Ritchie et al., 2015</xref>), and differential analysis of the RNA-seq data was done using the R package &#x201c;DESeq2&#x201d; (spinal muscular atrophy data) and microarray data using &#x201c;limma&#x201d; (<xref ref-type="bibr" rid="B45">Ritchie et al., 2015</xref>) (renal cell carcinoma datasets). Next, enrichment analysis based on the results of the differential expression analysis was initially performed using the complete dataset, that is, using all genes or proteins that were assigned to a particular gene set according to the database information. We denote this as the original analysis. Gene sets were defined based on pathway data from the KEGG, Reactome, and WikiPathways databases, as well as GO terms. KEGG, Reactome, and WikiPathways enrichment analyses were performed using the R package &#x201c;clusterProfiler&#x201d; (<xref ref-type="bibr" rid="B53">Wu et al., 2021</xref>), and GO term enrichment analyses were performed using the R package &#x201c;topGO&#x201d; (<xref ref-type="bibr" rid="B2">Alexa and Rahnenf&#xfc;hrer, 2009</xref>).
The enrichment analyses in &#x201c;clusterProfiler&#x201d; implement the methods described by <xref ref-type="bibr" rid="B49">Subramanian et al. (2005)</xref>, which are independent of thresholds for differentially expressed features. In contrast, the &#x201c;topGO&#x201d; method uses a threshold but has the advantage of incorporating information about the hierarchy of GO terms.</p>
<p>To study the variability and robustness of the outcome of the enrichment analysis, <italic>B</italic> bootstrap samples were drawn using only 95% of all the genes in each run. The genes were randomly drawn without replacement. GSEA was repeated for these randomly selected subsets of genes <italic>B</italic> times, where <italic>B</italic> is the number of times the whole set was resampled. The composition of the defined gene sets changed when bootstrapping from the gene sets was performed. Consequently, the composition of the gene sets was different for each bootstrap run. Thus, the effect of individual genes was also reflected in this approach. From this bootstrap procedure, <italic>B</italic> ranking lists of the gene sets were obtained.</p>
</sec>
<sec id="s2-2">
<title>2.2 Rank aggregation for single- and multi-omics analyses</title>
<p>The resulting enrichment analysis from the <italic>B</italic> bootstrap runs with <italic>B</italic> lists of GO terms was aggregated using the R package &#x201c;RobustRankAggreg&#x201d; (<xref ref-type="bibr" rid="B33">Kolde et al., 2012</xref>). The aggregation score for each pathway was obtained based on the number of occurrences and the ranks from each bootstrap run. The aggregation score was further transformed into a rank for each pathway. To study the robustness of the original findings, the rank obtained from the aggregated score can be compared with the actual analysis, that is, the analysis without a bootstrap step.</p>
<p>For multi-omics data, original and bootstrap enrichment analyses were first performed for each omics domain, resulting in one list of aggregated ranks per domain. The aggregated scores from each omics domain were further aggregated. In one of our data examples, enrichment analysis was performed separately for the transcriptomics and proteomics data, and both ranking lists were aggregated into one final ranking list. Thus, the final multi-omics score for each pathway or GO term was obtained.</p>
</sec>
<sec id="s2-3">
<title>2.3 R package: bootGSEA</title>
<p>The workflow shown in <xref ref-type="fig" rid="F1">Figure 1</xref> has been compiled and implemented in the new R package &#x201c;bootGSEA&#x201d; available at the GitHub repository (<ext-link ext-link-type="uri" xlink:href="https://github.com/klausjung-hannover/bootGSEA">https://github.com/klausjung-hannover/bootGSEA</ext-link>). The input requires the results of differential expression analysis. The package currently has eight functions. The functions boot.GO and boot.pathway are used for GO and pathway enrichment analyses, respectively, of the complete data (original analysis) and of bootstrapped data samples, and aggr.boot.GO and aggr.boot.pathway are used for the rank aggregation of pathways obtained from the former functions. In the functions boot.GO and boot.pathway, the user can specify which percentage of features should be drawn during the bootstrap runs. Furthermore, to understand the robustness of pathways at a broader level, we used a multi-omics approach by aggregating ranks from individual omics levels using the aggr.multiomics function. In addition, three functions are provided to visualize these results and study the robustness of the findings. Examples of these visualizations are presented in <xref ref-type="sec" rid="s3">Section 3</xref>. The function compareRank was implemented to compare the original and bootstrapped results at the single-omics level, the function plotRank to compare them at both single- and multi-omics levels, and the function histDiff to show the rank differences between the original and bootstrap analyses.</p>
</sec>
<sec id="s2-4">
<title>2.4 Example data 1: transcriptomics data from a renal cancer study</title>
<p>The gene expression profiles of renal cell carcinoma (RCC) datasets (GSE6344 (<xref ref-type="bibr" rid="B10">Copland, 2008</xref>), GSE14762 (<xref ref-type="bibr" rid="B13">Dykema and Furge, 2009</xref>), GSE11024 (<xref ref-type="bibr" rid="B34">Kort, 2008</xref>), GSE14994 (<xref ref-type="bibr" rid="B48">Signoretti and Beroukhim, 2010</xref>), GSE53757 (<xref ref-type="bibr" rid="B27">John Copland et al., 2014</xref>), and GSE15641 (<xref ref-type="bibr" rid="B20">Jones et al., 2005</xref>)) were downloaded from the Gene Expression Omnibus (GEO) database using the GEOquery (<xref ref-type="bibr" rid="B11">Davis and Meltzer, 2007</xref>) package in the R platform. Detailed information about the datasets, including platform and sample size, is given in <xref ref-type="table" rid="T1">Table 1</xref>. Differential expression analysis was performed using the limma (<xref ref-type="bibr" rid="B45">Ritchie et al., 2015</xref>) package in R. DEGs were screened based on FDR-adjusted <italic>p</italic>-values <inline-formula id="inf1">
<mml:math id="m1">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>0.05 as the cut-off value for all datasets. The results from the differential analysis were further analyzed following the pipeline shown in <xref ref-type="fig" rid="F1">Figure 1</xref> at the single-omics level.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Renal cell carcinoma (RCC) datasets with accession numbers from the GEO database, sample sizes in the normal and tumor groups, and references to the publication of the original analysis.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Accession no.</th>
<th align="left">Platform</th>
<th align="left">n Normal</th>
<th align="left">n Tumor</th>
<th align="left">Reference</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">GSE6344</td>
<td align="left">GPL96</td>
<td align="right">10</td>
<td align="right">10</td>
<td align="left">
<xref ref-type="bibr" rid="B10">Copland (2008)</xref>
</td>
</tr>
<tr>
<td align="left">GSE14762</td>
<td align="left">GPL4866</td>
<td align="right">12</td>
<td align="right">10</td>
<td align="left">
<xref ref-type="bibr" rid="B13">Dykema and Furge (2009)</xref>
</td>
</tr>
<tr>
<td align="left">GSE11024</td>
<td align="left">GPL6671</td>
<td align="right">12</td>
<td align="right">67</td>
<td align="left">
<xref ref-type="bibr" rid="B34">Kort (2008)</xref>
</td>
</tr>
<tr>
<td align="left">GSE14994</td>
<td align="left">GPL3921</td>
<td align="right">11</td>
<td align="right">59</td>
<td align="left">
<xref ref-type="bibr" rid="B48">Signoretti and Beroukhim (2010)</xref>
</td>
</tr>
<tr>
<td align="left">GSE53757</td>
<td align="left">GPL570</td>
<td align="right">72</td>
<td align="right">72</td>
<td align="left">
<xref ref-type="bibr" rid="B27">John Copland et al. (2014)</xref>
</td>
</tr>
<tr>
<td align="left">GSE15641</td>
<td align="left">GPL96</td>
<td align="right">23</td>
<td align="right">69</td>
<td align="left">
<xref ref-type="bibr" rid="B20">Jones et al. (2005)</xref>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-5">
<title>2.5 Example data 2: transcriptomics and proteomics data from a study on SMA</title>
<p>Severe (&#x201c;Taiwanese&#x201d;) SMA mice [(FVB.Cg-Tg (SMN2)2Hung Smn1tm1Hung/J)] (<xref ref-type="bibr" rid="B25">Hsieh-Li et al., 2000</xref>) were bred by an established breeding scheme (<xref ref-type="bibr" rid="B44">Riessland et al., 2010</xref>), resulting in a litter of half SMA mice (tgSMN2tg/0, mSmn1&#x2212;/&#x2212;) and half control mice (tgSMN2tg/0, mSmn1&#x2b;/&#x2212;). For analysis, the animals were euthanized by decapitation on pre-symptomatic post-natal day 3 (P3), and a tail tip biopsy was taken for genotyping, as described previously (<xref ref-type="bibr" rid="B24">Hensel et al., 2012</xref>). The kidneys were dissected, snap-frozen in liquid nitrogen, and stored at &#x2212;80&#xb0;C until analysis. Tissue was lysed for either RNA-seq or proteomics analyses, as described previously (<xref ref-type="bibr" rid="B4">Allardyce et al., 2020</xref>; <xref ref-type="bibr" rid="B22">Hensel et al., 2021</xref>), using total organ and total RNA. All animal experiments were conducted in accordance with the German Animal Welfare law and approved by the Ministry of Food, Agriculture, and Consumer Protection of Lower Saxony (LAVES file no. 19/3309).</p>
<p>The datasets including 54,146 mRNA transcripts and 7,959 proteins from the kidney samples of severe SMA and heterozygous control littermates were used for analysis. These data were used to evaluate our new bootstrap and rank aggregation approach in view of multi-omics data. Transcriptomic data included two control and two SMA-pooled samples, and proteomics data included four control and four SMA samples. Differential expression analysis was performed for the transcriptomic and proteomic data based on the control and SMA groups. These differentially expressed genes and proteins were further analyzed to determine the enriched pathways and GO terms using the pipeline (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>The bootGSEA pipeline used the results from the differential expression analysis as input, and GSEA was performed separately for the following types of gene sets: GO terms (BP, MF, and CC), KEGG, Reactome, and WikiPathways databases. The enrichment analysis performed in this pipeline provided two ranking lists of gene sets as outputs: one list from the original analysis of the complete data and one aggregated list from the analysis of the bootstrap samples. The original analysis included the entire list of genes or proteins from differential expression analysis. The bootstrap analysis involved taking the subsampled lists of genes or proteins for enrichment analysis, providing <italic>B</italic> additional ranking lists of results for enrichment analysis. To further determine the robustness of the bootstrap analysis, we aggregated the <italic>B</italic> lists to make them comparable to the ranking of the original analysis. All analyses were initially performed at a single-omics level.</p>
<p>For multi-omics analysis, rank scores from single-omics levels (transcriptomics and proteomics) were further integrated by rank aggregation to obtain an integrated score for the pathways or GO terms retrieved. In the following sections, we describe the results of the differential expression analysis, original findings of the GSEA, bootstrap enrichment analysis, and aggregated results from the six cancer datasets and the two omics levels from the SMA data. First, the results for the renal cancer data are shown, followed by the results for the multi-omics data from SMA mouse kidney samples. To compare the ranking obtained from the original and bootstrap GSEA, we mainly described rank gains and losses of individual GO terms or pathways. We avoided using the correlation coefficient since a correlation of, for example, 0.90 sounds high but can still include large rank differences. Only for the comparison between transcriptomics and proteomics in the SMA example did we use Kendall&#x2019;s <italic>&#x3c4;</italic> to describe the advantage of the bootstrap approach.</p>
<sec id="s3-1">
<title>3.1 Example 1: analysis of renal cancer data</title>
<p>The six microarray datasets were downloaded from the GEO database using the GEOquery package in R. The total number of mRNA transcripts in each dataset used for differential expression analysis was as follows: GSE6344 with 21,225 transcripts, GSE11024 with 17,637 transcripts, GSE14762 with 17,232 transcripts, GSE15641 with 21,225 transcripts, GSE14994 with 21,238 transcripts, and GSE53757 with 44,134 transcripts. We used these data as examples to demonstrate how bootstrap GSEA can help study the robustness of the results across the six datasets.</p>
<sec id="s3-1-1">
<title>3.1.1 Differential expression analysis</title>
<p>Differential expression analysis of mRNA transcripts was performed using the limma package in R for all datasets, and DEGs were filtered based on an adjusted <italic>p</italic>-value <inline-formula id="inf2">
<mml:math id="m2">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula> 0.05. The number of DEGs retrieved was as follows: 6,947 for GSE6344; 4,199 for GSE11024; 7,256 for GSE14762; 9,665 for GSE15641; 10,465 for GSE14994; and 27,968 for GSE53757.</p>
</sec>
<sec id="s3-1-2">
<title>3.1.2 Gene set enrichment analysis</title>
<p>GSEA for each of the six datasets was performed individually, with the total number of transcripts from the differential expression analysis as input. GSEA was performed using the function boot.GO for GO enrichment analysis and boot.pathway for pathway enrichment analysis, obtained from our new package bootGSEA. The analyses were performed separately for each dataset. For example, for the GSE6344 dataset, BP-GO analysis was performed using the boot.GO function, which provides two lists of results: the original analysis and bootstrap analysis. With <italic>B</italic> &#x3d; 100, 100 ranking lists were obtained for the bootstrap analysis. Following this analysis, the rank aggregation approach using aggr.boot.GO function was used to build a score representing the bootstrap analysis. This analysis resulted in a table with GO terms, ranks from each bootstrap run, and the aggregated rank score for each GO term. Similarly, analysis was performed for MF, CC, and pathway analyses (KEGG, Reactome, and WikiPathways) using the boot.pathway function. The resulting table consists of GO terms or pathways, aggregated scores, individual ranks of GO terms or pathways from bootstrap runs, original ranks, Fisher&#x2019;s <italic>p</italic>-value from the original analysis, and bootstrap ranks based on the aggregated score. A summary of the number of annotated GO terms and pathways is given in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Summary information about the gene set enrichment analysis (GSEA) on the six renal cancer datasets. Displayed numbers per dataset are the total number of annotated GO terms or pathways and the number of enriched GO terms or pathways based on Fisher&#x2019;s <italic>p</italic>-value <inline-formula id="inf3">
<mml:math id="m3">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula> 0.05 for the original analysis.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">GSE ID</th>
<th rowspan="2" align="center">GSEA</th>
<th colspan="2" align="center">No. of GO terms or pathways</th>
</tr>
<tr>
<th align="right">Annotated</th>
<th align="right">Significant in the original analysis</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="6" align="left">GSE6344</td>
<td align="left">GO:BP</td>
<td align="right">15,444</td>
<td align="right">1,643</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,684</td>
<td align="right">525</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">1,939</td>
<td align="right">329</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">343</td>
<td align="right">204</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,489</td>
<td align="right">410</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">600</td>
<td align="right">191</td>
</tr>
<tr>
<td rowspan="6" align="left">GSE11024</td>
<td align="left">GO:BP</td>
<td align="right">15,847</td>
<td align="right">1,465</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,900</td>
<td align="right">424</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">1,977</td>
<td align="right">295</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">344</td>
<td align="right">55</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,527</td>
<td align="right">93</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">605</td>
<td align="right">57</td>
</tr>
<tr>
<td rowspan="6" align="left">GSE14762</td>
<td align="left">GO:BP</td>
<td align="right">15,799</td>
<td align="right">5,029</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,889</td>
<td align="right">831</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">1,978</td>
<td align="right">521</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">344</td>
<td align="right">64</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,515</td>
<td align="right">69</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">603</td>
<td align="right">41</td>
</tr>
<tr>
<td rowspan="6" align="left">GSE15641</td>
<td align="left">GO:BP</td>
<td align="right">15,445</td>
<td align="right">1,876</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,659</td>
<td align="right">537</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">1,976</td>
<td align="right">336</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">343</td>
<td align="right">294</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,489</td>
<td align="right">340</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">600</td>
<td align="right">179</td>
</tr>
<tr>
<td rowspan="6" align="left">GSE14994</td>
<td align="left">GO:BP</td>
<td align="right">15,417</td>
<td align="right">1,796</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,672</td>
<td align="right">204</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">1,942</td>
<td align="right">180</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">342</td>
<td align="right">171</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,490</td>
<td align="right">594</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">610</td>
<td align="right">202</td>
</tr>
<tr>
<td rowspan="6" align="left">GSE53757</td>
<td align="left">GO:BP</td>
<td align="right">15,927</td>
<td align="right">1,184</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,944</td>
<td align="right">157</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">1,986</td>
<td align="right">108</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">344</td>
<td align="right">255</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,543</td>
<td align="right">872</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">625</td>
<td align="right">348</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-1-3">
<title>3.1.3 Robustness analysis of pathways and GO terms</title>
<p>Robustness analysis of the pathways and GO terms to evaluate the stability of the identified pathways and GO terms across 100 iterations versus the original analysis was performed as follows. First, a scatter plot was used to identify pathways or GO terms with a high degree of variability in ranks between the original and bootstrapped analyses (ranks based on scores from rank aggregation). Scatter plots for each pathway analysis (KEGG, Reactome, and WikiPathways) and GO terms (BP, MF, and CC) were analyzed for the six datasets. In the following paragraphs, we describe the results exemplarily for the MF-GO enrichment analysis in dataset GSE6344. For the remaining results of this and the other cancer datasets, we refer to <xref ref-type="sec" rid="s11">Supplementary Figures S1&#x2013;S35</xref>. The distribution of the GO-MF terms for dataset GSE6344 is shown in <xref ref-type="fig" rid="F2">Figure 2A</xref>. The <italic>x</italic>-axis displays the ranks from the original analysis, while the <italic>y</italic>-axis shows the aggregated ranks from the bootstrap analysis. The top scatter plot shows all pathways with gain, loss, or retained ranks. The bottom scatter plots show the individual distribution scales of the gain, loss, or retained ranks. Only very few gene sets had the same rank in the original and bootstrap analyses. Therefore, the gain or loss of ranks indicates a certain level of variability in comparison. The higher the variability, the less robust the findings of the original analysis. To further understand this variability of ranks in GO terms, the distribution of rank difference between the original and bootstrapped analyses was analyzed (<xref ref-type="fig" rid="F2">Figure 2B</xref>). We determined the quantiles of this distribution to identify terms with extreme gains or losses. 
Specifically, GO terms that fall below the 2.5% or exceed the 97.5% quantile have a larger rank difference and are, therefore, less reliable and more susceptible to variation in the data. In this example, the minimum size for rank losses below the 2.5% quantile is 2,091; that is, gene sets that fall below this quantile have a difference in rank of at least 2,091. The minimum rank gain at the 97.5% quantile was 1,525. These gene sets, with such differences in rank, are considered not robust and, therefore, appear rather unsuitable for biological interpretation.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Comparison of the original analysis and bootstrap analysis for Gene Ontology (GO) terms of molecular functions in the renal cancer dataset GSE6344. <bold>(A)</bold> Top: along the <italic>x</italic>-axis are the ranks from the original GSEA, and the <italic>y</italic>-axis corresponds to the ranks from the bootstrap GSEA. The GO terms that have gained, lost, or retained the same rank after 100 bootstrap runs are shown separately in the plots at the bottom. <bold>(B)</bold> Histogram of differences in ranks between the original rank and aggregated rank from the bootstrap GSEA.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g002.tif"/>
</fig>
<p>To better understand and assess the robustness of the gene sets identified in the six independent datasets, the original and bootstrap analyses were compared for the terms/pathways common to at least two datasets among the top 1,000 gene sets, ordered based on their Fisher&#x2019;s <italic>p</italic>-values and aggregated scores from the bootstrap analysis (<xref ref-type="fig" rid="F3">Figure 3</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S36</xref>). The plot of GO-MF terms shows a clear difference between the original and bootstrap analyses, indicating that the gene sets found by the original standard analysis might not be robust and require further investigation. In contrast, for the pathway analyses, the gene sets were consistent across the original and bootstrap analyses (<xref ref-type="sec" rid="s11">Supplementary Figure S37</xref>, line plots of KEGG, Reactome, and WikiPathways), indicating that the gene sets were robust and reliable when comparing the results from the six independent datasets.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Comparison analysis between the original and bootstrapped analyses of common GO terms in molecular function among the six renal cell carcinoma (RCC) datasets. A higher agreement between the six datasets is observed when using the ranking of GO terms according to the bootstrap analysis.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g003.tif"/>
</fig>
<p>To further evaluate the robustness of our analysis, a network-based approach was performed for the 58 GO-MF terms obtained from the enrichment analysis of all 6 datasets from the bootstrap and original analyses. The 58 GO-MF terms (<xref ref-type="sec" rid="s11">Supplementary Table S1</xref>) were selected by combining the bootstrap aggregated results of all 6 datasets. The same procedure was performed for the original enrichment analysis. A network was then constructed for these 58 GO terms in REVIGO (<xref ref-type="bibr" rid="B50">Supek et al., 2011</xref>), with each GO term as a node and edges between the nodes if there is a significant correlation between the corresponding gene sets. The constructed network was visualized in Cytoscape (<xref ref-type="bibr" rid="B47">Shannon et al., 2003</xref>) to evaluate the robustness of the network with one of the network metrics in cytoHubba available in Cytoscape. We used the radiality metric to evaluate the robustness. The bootstrap analysis yields more well-connected GO terms in the network than the original analysis when the top 10 GO terms of the original and bootstrap analyses are ranked based on their radiality metric (<xref ref-type="fig" rid="F4">Figure 4</xref>). Comparatively, the original analysis has fewer connections, indicating that the bootstrap analyses provide more robust results.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Comparative network analysis between the original (left) and bootstrapped (right) analyses, showing the top 10 GO terms among the six RCC datasets, where each node represents a GO term.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g004.tif"/>
</fig>
</sec>
</sec>
<sec id="s3-2">
<title>3.2 Example 2: multi-omics of the kidney in a SMA mouse model</title>
<sec id="s3-2-1">
<title>3.2.1 Differential expression analysis</title>
<p>Transcriptomic and proteomic data were pre-processed for differential expression analysis. This processing included the imputation of missing values in the proteomics data, for which the KNN method implemented in the R package &#x201c;impute&#x201d; was used. Differential expression analysis using &#x201c;DESeq2&#x201d; and the &#x201c;limma&#x201d; package in R was performed for transcriptomic and proteomic data, respectively. The analysis retrieved 29,596 transcripts and 7,959 proteins. A total of 81 DEGs and 148 differentially expressed proteins (DEPs) were selected based on the criteria of <italic>p</italic>-value <inline-formula id="inf4">
<mml:math id="m4">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula> 0.05 and &#x7c;<italic>logFC</italic>&#x7c; &#x3e; 1.</p>
</sec>
<sec id="s3-2-2">
<title>3.2.2 Gene set enrichment analysis</title>
<p>GSEA for the original and bootstrap analyses was performed separately at the transcriptomic and proteomic levels using our new R package bootGSEA with the functions boot.GO (for GO analysis) and boot.pathway (for KEGG, Reactome, and WikiPathways). The input data were the results obtained from the differential expression analyses. The analyses resulted in two outputs: original analysis and bootstrapped analysis. The bootstrapped analysis included <italic>B</italic> &#x3d; 100 lists of enrichment analyses based on random subsets of genes and proteins. These 100 lists of enrichment results were aggregated by rank aggregation using the functions aggr.boot.GO (for GO analysis) and aggr.boot.pathway (for KEGG, Reactome, and WikiPathways analyses), where the input is the output from the functions boot.GO and boot.pathway. This analysis was performed for all GO terms (BP, MF, and CC), KEGG, Reactome, and WikiPathways at a single-omics level (transcriptomics and proteomics data individually) of the SMA kidney data (<xref ref-type="table" rid="T3">Table 3</xref>).</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Summary of the gene set enrichment analysis (GSEA) of the spinal muscular atrophy (SMA) mouse data. The total number of annotated GO terms and pathways and the number of enriched GO terms and pathways based on Fisher&#x2019;s <italic>p</italic>-value <inline-formula id="inf5">
<mml:math id="m5">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula> 0.05 for the original analysis are given.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="3" align="center">GSEA</th>
<th colspan="4" align="center">No. of GO terms or pathways retrieved</th>
</tr>
<tr>
<th colspan="2" align="center">Transcriptomics</th>
<th colspan="2" align="center">Proteomics</th>
</tr>
<tr>
<th align="right">Annotated</th>
<th align="right">Significant in the original analysis</th>
<th align="right">Annotated</th>
<th align="right">Significant in the original analysis</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">GO:BP</td>
<td align="right">15,995</td>
<td align="right">228</td>
<td align="right">13,670</td>
<td align="right">130</td>
</tr>
<tr>
<td align="left">GO:MF</td>
<td align="right">4,837</td>
<td align="right">99</td>
<td align="right">3,893</td>
<td align="right">58</td>
</tr>
<tr>
<td align="left">GO:CC</td>
<td align="right">2,020</td>
<td align="right">27</td>
<td align="right">1,829</td>
<td align="right">18</td>
</tr>
<tr>
<td align="left">KEGG</td>
<td align="right">337</td>
<td align="right">5</td>
<td align="right">305</td>
<td align="right">22</td>
</tr>
<tr>
<td align="left">Reactome</td>
<td align="right">1,093</td>
<td align="right">38</td>
<td align="right">617</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">WikiPathways</td>
<td align="right">150</td>
<td align="right">7</td>
<td align="right">89</td>
<td align="right">6</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-2-3">
<title>3.2.3 Robustness analysis of pathways and GO terms at the single-omics level</title>
<p>The original and bootstrapped analyses for the GO terms and pathways of transcriptomics and proteomics data were compared using a scatter plot to analyze the distribution of ranks for their robustness (<xref ref-type="fig" rid="F5">Figure 5</xref>). The GO-CC terms that show an increase in robustness from our pipeline are shown as gain of rank, the terms that lost their rank after <italic>B</italic> bootstrap runs show lower robustness, and those terms that retained their ranks even after 100 runs are considered robust. However, the comparison of transcriptomic analyses did not result in any retained ranks (<xref ref-type="fig" rid="F5">Figure 5A</xref>). Therefore, all terms either gained or lost their ranks in this analysis. Proteomics analysis (<xref ref-type="fig" rid="F5">Figure 5B</xref>), on the other hand, had very few retained ranks. The same comparison analysis was performed for the other GO terms and pathways (<xref ref-type="sec" rid="s11">Supplementary Figures S38&#x2013;S42</xref>). It should be noted that transcriptomic analysis for both BP and MF had only one term with a retained rank of 11,489 and 1,249, respectively. However, both terms (GO:0072429 and GO:0004985) were ranked very low, indicating that they were not significantly enriched.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Comparison of the original and bootstrapped analyses for the GO term of cellular components in the transcriptomics <bold>(A)</bold> and proteomics <bold>(B)</bold> spinal muscular atrophy (SMA) kidney data. Top plots: along the <italic>x</italic>-axis is the original enrichment analysis, and the <italic>y</italic>-axis corresponds to the bootstrapped enrichment analysis. The GO terms that have gained, lost, or retained the same rank after <italic>B</italic> &#x3d; 100 bootstrap runs are shown separately in the plots at the bottom.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g005.tif"/>
</fig>
<p>Next, the distribution of rank differences between the original and bootstrapped analyses of the GO-CC terms and pathways was analyzed (<xref ref-type="fig" rid="F6">Figure 6</xref>). The terms and pathways that fall below the 2.5% or exceed the 97.5% quantiles have a very large difference between the original and bootstrapped analyses and should not be considered for biological interpretation. Quantiles to specify terms with extreme changes between the original and bootstrap analyses were determined (<xref ref-type="fig" rid="F6">Figure 6</xref>). For transcriptomics data (<xref ref-type="fig" rid="F6">Figure 6A</xref>), the minimum rank difference for the lost rank below the 2.5% quantile is &#x2212;939.5; that is, GO terms below this quantile have a rank difference of at least 939.5 between the original analysis and bootstrap analysis. The minimum rank gain at the 97.5% quantile was 602. Such huge differences in ranks mean that the terms below or above these quantiles should rather not be considered for biological interpretation. For proteomics data (<xref ref-type="fig" rid="F6">Figure 6B</xref>), the corresponding minimum rank differences were 195.6 and 62.3 for lost and gained ranks, respectively. Histograms for the difference between original and bootstrap rankings related to other GO terms and pathways are given in <xref ref-type="sec" rid="s11">Supplementary Figures S43&#x2013;S47</xref>.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Histogram of the difference in rank of the original rank and the aggregated rank of the bootstrapped analysis in cellular components for transcriptomics and proteomics SMA kidney data. <bold>(A)</bold> Distribution of rank difference in the transcriptomics data of 2,020 terms, where cellular component terms beyond the 2.5% and 97.5% quantiles have a difference of 939.5 and 602, respectively. <bold>(B)</bold> Distribution of rank difference in the proteomics data of 1,829 terms, where cellular component terms beyond the 2.5% and 97.5% quantiles have a difference of 195.6 and 62.3, respectively.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g006.tif"/>
</fig>
</sec>
<sec id="s3-2-4">
<title>3.2.4 Multi-omics analysis</title>
<p>Transcriptomics and proteomics data analyzed individually using the proposed pipeline yielded several GO terms and pathways with increased or decreased robustness, while only few GO terms and pathways retained the same rank after <italic>B</italic> bootstrap runs at the single-omics level. These terms or pathways were further analyzed for common terms using a Venn diagram, where an integrated score for these common terms was obtained by aggregating the ranks between omics levels.</p>
<p>Common cellular component terms among the transcriptomic and proteomic omics levels were retrieved using a Venn diagram (<xref ref-type="fig" rid="F7">Figure 7</xref>). Of the 2,020 terms from the transcriptomics analysis and 1,829 terms from the proteomics analysis, there were 1,823 common GO-CC terms, for which an integrated score was obtained. A plot comparing the integrated rank score and the original analysis was constructed at each omics level to analyze and evaluate the robustness of the common cellular component terms (<xref ref-type="fig" rid="F8">Figure 8</xref>). The GO-CC terms that showed an increase in robustness from our pipeline were shown as a gain of rank, whereas the terms that lost their rank after 100 bootstrap runs were indicated as a decrease in robustness. The terms that retained their ranks even after 100 runs were deemed robust. However, the comparison of transcriptomics analyses did not result in any retained ranks (<xref ref-type="fig" rid="F8">Figure 8A</xref>), indicating that all terms either gained or lost their rank in this analysis. In contrast, the proteomics analysis (<xref ref-type="fig" rid="F8">Figure 8B</xref>) had very few retained ranks, suggesting that the terms with retained ranks are robust and other terms that have either gained or lost ranks are subject to uncertainty.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Venn diagram showing the overlap of significant GO terms for cellular components in the transcriptomics and proteomics data of the SMA study.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g007.tif"/>
</fig>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Scatter plot for the evaluation of the robustness of GO-CC terms comparing the rank of the original analysis and the rank obtained from the integrated score of common terms at the multi-omics level. <bold>(A)</bold> Top: along the <italic>x</italic>-axis is the original rank of transcriptomic analysis, and the <italic>y</italic>-axis corresponds to the rank of common GO-CC terms based on the integrated score. The gained and lost ranks of the GO-CC terms are shown separately in the bottom plots. No ranks were retained in this comparison. <bold>(B)</bold> Top: along the <italic>x</italic>-axis is the original rank of the proteomics analysis, and the <italic>y</italic>-axis corresponds to the rank of common GO-CC terms based on the integrated score. The gained, lost, and retained ranks of the GO-CC terms are presented separately in the bottom plots. Very few terms have retained ranks, meaning that most terms have either gained or lost their robustness after 100 bootstrap runs.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g008.tif"/>
</fig>
<p>To further understand this variability of ranks (gains and losses) in GO terms, we analyzed the distribution of rank differences between the original and integrated analyses (<xref ref-type="fig" rid="F9">Figure 9</xref>). We determined the quantiles of this distribution to identify terms with extreme gains or losses. Specifically, GO terms that fall below the 2.5% or exceed the 97.5% quantile have a larger rank difference and are, therefore, less reliable and more susceptible to variation in data. These gene sets with such large differences in rank are considered less robust and rather not suitable for biological interpretation.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Histogram of the difference in rank of the original rank and the rank obtained from the integrated score of common terms at the multi-omics level. <bold>(A)</bold> Distribution of rank difference in transcriptomics data, where GO-CC terms beyond the 2.5% and 97.5% quantiles have a difference of 961 and 602, respectively. <bold>(B)</bold> Distribution of rank difference in proteomics data, where GO terms beyond the 2.5% and 97.5% quantiles have a difference of 193 and 62, respectively.</p>
</caption>
<graphic xlink:href="fbinf-04-1380928-g009.tif"/>
</fig>
<p>The same procedure was performed for the other GO terms and pathways, where the Venn diagrams of the common terms are given in <xref ref-type="sec" rid="s11">Supplementary Figure S48</xref>. Plots comparing the distribution of ranks between the original analysis and the integrated rank score for these common terms and pathways are given in <xref ref-type="sec" rid="s11">Supplementary Figures S49&#x2013;S53</xref>. Histograms showing the difference in rank based on the integrated score and the original rank from the single-omics level are given in <xref ref-type="sec" rid="s11">Supplementary Figures S54&#x2013;S58</xref>. <xref ref-type="table" rid="T4">Table 4</xref> provides a summary of the top 20 GO-CC terms ranked based on the integrated score obtained from the transcriptomics and proteomics analyses. In addition, the table also includes the robustness rank difference of the GO term on the transcriptomics (robustness T) and proteomics (robustness P) levels. A lesser robustness rank difference indicates that the GO term is more robust across the transcriptomics and proteomics data, while a higher robustness rank difference indicates that the GO term is less robust (refer to <xref ref-type="sec" rid="s11">Supplementary Table S2</xref> (CC) for complete data).</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Top 20 GO terms of cellular components ranked based on the integrated score from the bootstrap analysis. The table lists the original ranks of each omics level (original rank T and original rank P), the aggregated bootstrap ranks (bootstrap rank T and bootstrap rank P), the integrated score (the score obtained from common GO terms between omics levels), and the robustness (gain T and gain P) of each term (gain, loss, or the same rank when comparing the ranks of the original analysis with those of the bootstrap analysis). Negative numbers indicate rank loss, and positive numbers indicate rank gain, while zero indicates no rank change.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">GO ID</th>
<th align="left">Term</th>
<th align="right">Orig. rank T</th>
<th align="right">Bt. rank T</th>
<th align="right">Orig. rank P</th>
<th align="right">Bt. rank P</th>
<th align="right">Integrated score</th>
<th align="right">Gain T</th>
<th align="right">Gain P</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">GO:0000109</td>
<td align="left">Nucleotide excision repair complex</td>
<td align="right">202</td>
<td align="right">37</td>
<td align="right">1</td>
<td align="right">2</td>
<td align="right">0.0007</td>
<td align="right">165</td>
<td align="right">&#x2212;1</td>
</tr>
<tr>
<td align="left">GO:0000153</td>
<td align="left">Cytoplasmic ubiquitin ligase complex</td>
<td align="right">217</td>
<td align="right">41</td>
<td align="right">68</td>
<td align="right">62</td>
<td align="right">0.0019</td>
<td align="right">176</td>
<td align="right">6</td>
</tr>
<tr>
<td align="left">GO:0016021</td>
<td align="left">Integral component of the membrane</td>
<td align="right">43</td>
<td align="right">1</td>
<td align="right">277</td>
<td align="right">252</td>
<td align="right">0.0020</td>
<td align="right">42</td>
<td align="right">25</td>
</tr>
<tr>
<td align="left">GO:0005914</td>
<td align="left">Spot adherens junction</td>
<td align="right">503</td>
<td align="right">1,228</td>
<td align="right">2</td>
<td align="right">1</td>
<td align="right">0.0020</td>
<td align="right">&#x2212;725</td>
<td align="right">1</td>
</tr>
<tr>
<td align="left">GO:0000124</td>
<td align="left">SAGA complex</td>
<td align="right">209</td>
<td align="right">27</td>
<td align="right">101</td>
<td align="right">65</td>
<td align="right">0.0021</td>
<td align="right">182</td>
<td align="right">36</td>
</tr>
<tr>
<td align="left">GO:0000813</td>
<td align="left">ESCRT I complex</td>
<td align="right">278</td>
<td align="right">82</td>
<td align="right">76</td>
<td align="right">64</td>
<td align="right">0.0033</td>
<td align="right">196</td>
<td align="right">12</td>
</tr>
<tr>
<td align="left">GO:0031224</td>
<td align="left">Intrinsic component of the membrane</td>
<td align="right">50</td>
<td align="right">2</td>
<td align="right">263</td>
<td align="right">244.5</td>
<td align="right">0.0039</td>
<td align="right">48</td>
<td align="right">18.5</td>
</tr>
<tr>
<td align="left">GO:0001527</td>
<td align="left">Microfibril</td>
<td align="right">298</td>
<td align="right">94</td>
<td align="right">69</td>
<td align="right">47</td>
<td align="right">0.0043</td>
<td align="right">204</td>
<td align="right">22</td>
</tr>
<tr>
<td align="left">GO:0001533</td>
<td align="left">Cornified envelope</td>
<td align="right">299</td>
<td align="right">95</td>
<td align="right">117</td>
<td align="right">87</td>
<td align="right">0.0044</td>
<td align="right">204</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">GO:0000974</td>
<td align="left">Prp19 complex</td>
<td align="right">293</td>
<td align="right">90</td>
<td align="right">107</td>
<td align="right">103</td>
<td align="right">0.0052</td>
<td align="right">203</td>
<td align="right">4</td>
</tr>
<tr>
<td align="left">GO:0000315</td>
<td align="left">Organellar large ribosomal subunit</td>
<td align="right">239</td>
<td align="right">51</td>
<td align="right">113</td>
<td align="right">110</td>
<td align="right">0.0059</td>
<td align="right">188</td>
<td align="right">3</td>
</tr>
<tr>
<td align="left">GO:0012505</td>
<td align="left">Endomembrane system</td>
<td align="right">59</td>
<td align="right">3</td>
<td align="right">397</td>
<td align="right">371</td>
<td align="right">0.0059</td>
<td align="right">56</td>
<td align="right">26</td>
</tr>
<tr>
<td align="left">GO:0097229</td>
<td align="left">Sperm end piece</td>
<td align="right">1,578</td>
<td align="right">1,768</td>
<td align="right">3</td>
<td align="right">3</td>
<td align="right">0.0059</td>
<td align="right">&#x2212;190</td>
<td align="right">0</td>
</tr>
<tr>
<td align="left">GO:0001917</td>
<td align="left">Photoreceptor inner segment</td>
<td align="right">315</td>
<td align="right">116</td>
<td align="right">36</td>
<td align="right">60</td>
<td align="right">0.0066</td>
<td align="right">199</td>
<td align="right">&#x2212;24</td>
</tr>
<tr>
<td align="left">GO:0002178</td>
<td align="left">Palmitoyltransferase complex</td>
<td align="right">334</td>
<td align="right">125</td>
<td align="right">49</td>
<td align="right">49</td>
<td align="right">0.0076</td>
<td align="right">209</td>
<td align="right">0</td>
</tr>
<tr>
<td align="left">GO:0071944</td>
<td align="left">Cell periphery</td>
<td align="right">86</td>
<td align="right">4</td>
<td align="right">353</td>
<td align="right">319.5</td>
<td align="right">0.0079</td>
<td align="right">82</td>
<td align="right">33.5</td>
</tr>
<tr>
<td align="left">GO:0098965</td>
<td align="left">Extracellular matrix of the synaptic cleft</td>
<td align="right">1,733</td>
<td align="right">1,768</td>
<td align="right">4</td>
<td align="right">4</td>
<td align="right">0.0079</td>
<td align="right">&#x2212;35</td>
<td align="right">0</td>
</tr>
<tr>
<td align="left">GO:0001750</td>
<td align="left">Photoreceptor outer segment</td>
<td align="right">312</td>
<td align="right">110</td>
<td align="right">167</td>
<td align="right">128</td>
<td align="right">0.0080</td>
<td align="right">202</td>
<td align="right">39</td>
</tr>
<tr>
<td align="left">GO:0000835</td>
<td align="left">ER ubiquitin ligase complex</td>
<td align="right">282</td>
<td align="right">130</td>
<td align="right">28</td>
<td align="right">23</td>
<td align="right">0.0082</td>
<td align="right">152</td>
<td align="right">5</td>
</tr>
<tr>
<td align="left">GO:0000836</td>
<td align="left">Hrd1p ubiquitin ligase complex</td>
<td align="right">283</td>
<td align="right">132</td>
<td align="right">29</td>
<td align="right">25</td>
<td align="right">0.0084</td>
<td align="right">151</td>
<td align="right">4</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>These results highlight the importance of analyzing the robustness of GSEA to ensure that the results are reliable and reproducible. By identifying the GO-CC terms that are most robust across different omics levels, researchers can gain a more comprehensive understanding of the biological processes involved in the studied system.</p>
<p>Finally, we compared the ranks obtained from the transcriptomics level with those from the proteomics level separately for the original results and bootstrap results. For the GO categories, we found that transcriptomics and proteomics ranks from the bootstrap analyses are more correlated than the related ranks from the original analysis (BP: Kendall&#x2019;s <italic>&#x3c4;</italic>
<sub>
<italic>orig</italic>.</sub> &#x3d; 0.29 and <italic>&#x3c4;</italic>
<sub>
<italic>boot</italic>
</sub> &#x3d; 0.56; MF: <italic>&#x3c4;</italic>
<sub>
<italic>orig</italic>.</sub> &#x3d; 0.25 and <italic>&#x3c4;</italic>
<sub>
<italic>boot</italic>
</sub> &#x3d; 0.49; and CC: <italic>&#x3c4;</italic>
<sub>
<italic>orig</italic>.</sub> &#x3d; 0.33 and <italic>&#x3c4;</italic>
<sub>
<italic>boot</italic>
</sub> &#x3d; 0.53). For GSEA results with KEGG, Reactome, and WikiPathways databases, no significant correlation between the proteomics and transcriptomics levels was found.</p>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>Set-based enrichment tests are a necessary part of omics data analyses to better understand the biological meaning of differentially expressed genes, proteins, or other molecules. However, because the set compositions provided in the databases are subject to uncertainties, incorrect pathways can emerge from the analysis and lead to biological interpretations in the wrong direction. We observed in the two data examples of this work that pathways or terms obtained from several independent datasets of the same omics domain or different omics domains can have only a moderate overlap. For example, in the SMA dataset, some pathways or GO terms were selected from either transcriptomics or proteomics data analysis. Here, the cellular component terms ordered by their integrated score comprise terms for complexes and complex subunits that are indirectly related to neuromuscular diseases. The SMN is a part of the SMN complex and interferes functionally with several other complexes, such as the cytoplasmic ubiquitin ligase complex (GO:0000153) or ER ubiquitin ligase complex (GO:0000835) (<xref ref-type="bibr" rid="B8">Chaytow et al., 2018</xref>), which are highly ranked when ordered by their integrated score but were ranked much lower using the original ranking (<xref ref-type="table" rid="T4">Table 4</xref>). Furthermore, the top GO-BP term (<xref ref-type="sec" rid="s11">Supplementary Table S2</xref> (BP)) with the highest integrated rank (GO:0000245, spliceosomal complex assembly) has been associated with SMA (<xref ref-type="bibr" rid="B41">Price et al., 2018</xref>) but was ranked much lower with the standard GSEA of the single-omics analyses (transcriptomics rank: 1,596; proteomics rank: 81). In addition, vitamins play, in general, an important role in neurodegenerative disorders. 
Lack of water-soluble vitamins (GO:0006767, water-soluble vitamin metabolic process, integrated rank: 6) (<xref ref-type="sec" rid="s11">Supplementary Table S2</xref> (BP)) can lead to neurological diseases (<xref ref-type="bibr" rid="B42">Rai et al., 2021</xref>). In particular, vitamin B6 (GO:0042816, vitamin B6 metabolic process, integrated rank: 2) is necessary for the production of various neurotransmitters such as serotonin, dopamine, and <italic>&#x3b3;</italic>-aminobutyric acid (GABA). Deficiencies in vitamin B6 have been linked to depression and impaired brain function, such as epilepsy (<xref ref-type="bibr" rid="B21">Hellmann and Mooney, 2010</xref>). This suggests that it might be beneficial to consider providing sufficient supplementation of nutrients involved in maintaining an optimal methylation state, including folic acid, vitamin B12, and vitamin B6, for individuals with SMA (<xref ref-type="bibr" rid="B15">Fitzgerald and McArdle, 1941</xref>; <xref ref-type="bibr" rid="B16">Friesen et al., 2001</xref>). Therefore, our new pipeline provides not only robust terms but also biologically relevant terms when ordered by their integrated score.</p>
<p>Several attempts have been made to obtain a more robust enrichment analysis, for example, by integrating information about pathway topology (<xref ref-type="bibr" rid="B12">Draghici et al., 2007</xref>; <xref ref-type="bibr" rid="B18">Glaab et al., 2010</xref>; <xref ref-type="bibr" rid="B40">Massa et al., 2010</xref>) or GO hierarchy (<xref ref-type="bibr" rid="B3">Alexa et al., 2006</xref>) into the algorithms. Other approaches use sample permutations (<xref ref-type="bibr" rid="B49">Subramanian et al., 2005</xref>) or comparisons with results for random gene sets (<xref ref-type="bibr" rid="B32">Kim and Volsky, 2005</xref>) to account for uncertainties. The GAGE method (<xref ref-type="bibr" rid="B38">Luo et al., 2009</xref>) improves the robustness of GSEA by treating curated gene sets as either a pathway or an experimentally derived differential expression set. A method to evaluate the contribution of individual features to the significance of a gene set was also presented by <xref ref-type="bibr" rid="B28">Jung et al. (2011)</xref>.</p>
<p>Here, we present a new approach that combines bootstrap analysis at the gene set level with rank aggregation. This approach accounts for the uncertainty in set compositions by repeatedly analyzing subsets of each gene set. The percentage of genes or proteins to be selected for bootstrap can be chosen by the user of our R package. In case the user assumes much uncertainty in the database, a lower percentage should be taken. A major advantage of this approach is that it can be easily combined with other GSEA approaches. Exemplarily, we demonstrated the combination of our approach with pathway and GO term enrichment analyses implemented using the R packages clusterProfiler and topGO. We showed that overlaps of the detected GO terms between independent datasets were larger when using the bootstrap approach instead of the ranking from the standard analysis (<xref ref-type="fig" rid="F3">Figure 3</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S36</xref>). Similarly, we showed a higher rank correlation for the detected GO terms from transcriptomics and proteomics in the SMA data. Although increased overlaps were not observed for all types of sets, overlaps obtained using the bootstrap analysis were in no case smaller than the overlaps from the standard analysis.</p>
<p>In contrast to other approaches to account for uncertainty in GSEA, the rank aggregation step of our pipeline also allows the combination of results from multiple datasets or omics domains. This could also improve the stability and reproducibility of the findings.</p>
<p>In contrast to the approach proposed by <xref ref-type="bibr" rid="B46">Schmid et al. (2016)</xref>, we not only derived a measure for the robustness of the result for each set but also provided a new ranking of sets. The size of either a gain or loss can be used as a measure of robustness. While we used the 2.5% and 97.5% quantiles, the user is free to use other thresholds to identify sets with extreme gains or losses. Nevertheless, thresholds are useful to account for different numbers of sets in an analysis. When having an overall large number of sets, the values of the quantiles will be larger as well, meaning that larger gains or losses are allowed before flagging a gain or loss as extreme. A disadvantage of the current rank aggregation approach is that the new ranking is based on a score and not on a <italic>p</italic>-value. Therefore, it is a bit more difficult to specify a threshold for the selected sets.</p>
<p>To conclude, set-based analyses now have a long history in omics data analysis, where they facilitate the biological interpretation of selected features from differential expression analysis. However, the large number of different computational GSEA methods presented in the last two&#xa0;decades and the huge databases with pathway annotations provide an unmanageable number of possible results, and analysts may be conventional in their biological interpretations. Moreover, some entries in the databases may be less supported by experimental findings or by the literature than other entries. In this regard, our bootstrap approach can help separate less robust findings from more robust findings. The rank aggregation step can additionally help combine gene set results from multiple datasets of the same or different omics levels. In particular, a GO term or pathway is only highly ranked by the integrated score if there is evidence for the importance of a term or pathway from different omics levels. We demonstrated the use of our approach in an example with transcriptomics and proteomics data, but it could be extended by GSEA from other omics domains, such as metabolomics (<xref ref-type="bibr" rid="B39">Mahajan et al., 2024</xref>). The rank aggregation step also supports the idea of research synthesis, that is, integrating findings from different studies or data sources to obtain a higher level of scientific evidence. Our new pipeline bootGSEA is universal as it can be combined with the most common GSEA methods. However, when using &#x201c;topGO&#x201d; for GO analysis, which works in the sense of overrepresentation analysis, users must keep in mind that the results depend on thresholds for differentially expressed features.</p>
<p>As a future extension of our approach, we are considering not only removing features from pathways but also moving features between pathways, which is a type of change we have also observed in databases. This can, however, only be done using biological information about whether a feature makes biological sense in a certain pathway.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. The gene expression data of the cancer study can be publicly retrieved from the NCBI Gene Expression Omnibus database (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/">https://www.ncbi.nlm.nih.gov/geo/</ext-link>) using the accession numbers GSE6344, GSE14762, GSE11024, GSE14994, GSE53757, and GSE15641. The data of the SMA study will be made available by the authors upon request.</p>
</sec>
<sec id="s6">
<title>Ethics statement</title>
<p>The animal study was approved by the Lower Saxony State Office for Consumer Protection and Food Safety (LAVES, Germany, reference number 19/3309). The study was conducted in accordance with the local legislation and institutional requirements.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>SHK: methodology, data curation, formal analysis, software, visualization, and writing&#x2013;original draft. IT: formal analysis, investigation, validation, and writing&#x2013;review and editing. DK: investigation, writing&#x2013;review and editing, and data curation. PC: investigation, writing&#x2013;review and editing, funding acquisition, and project administration. KJ: funding acquisition, writing&#x2013;review and editing, conceptualization, methodology, resources, and supervision.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This project received funding from the European Union&#x2019;s Horizon 2020 Research and Innovation Program under the Marie Sk&#x142;odowska-Curie grant, agreement number 956185.</p>
</sec>
<ack>
<p>The authors thank Mr. Sergej Ruff (Institute for Animal Genomics, University of Veterinary Medicine, Hannover) for his technical support to finalize the R package bootGSEA.</p>
</ack>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fbinf.2024.1380928/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fbinf.2024.1380928/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Table2.XLSX" id="SM1" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Presentation1.PPTX" id="SM2" mimetype="application/PPTX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.XLSX" id="SM3" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ackermann</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Strimmer</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>A general modular framework for gene set enrichment analysis</article-title>. <source>BMC Bioinforma.</source> <volume>10</volume>, <fpage>47</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-10-47</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alexa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rahnenf&#xfc;hrer</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Gene set enrichment analysis with topgo</article-title>. <source>Bioconductor Improv</source> <volume>27</volume>, <fpage>1</fpage>&#x2013;<lpage>26</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alexa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rahnenf&#xfc;hrer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lengauer</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Improved scoring of functional groups from gene expression data by decorrelating go graph structure</article-title>. <source>Bioinformatics</source> <volume>22</volume>, <fpage>1600</fpage>&#x2013;<lpage>1607</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btl140</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Allardyce</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kuhn</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hernandez-Gerez</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hensel</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.-T.</given-names>
</name>
<name>
<surname>Faller</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Renal pathology in a mouse model of severe spinal muscular atrophy is associated with downregulation of glial cell-line derived neurotrophic factor (gdnf)</article-title>. <source>Hum. Mol. Genet.</source> <volume>29</volume>, <fpage>2365</fpage>&#x2013;<lpage>2378</lpage>. <pub-id pub-id-type="doi">10.1093/hmg/ddaa126</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bayerlov&#xe1;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kramer</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Klemm</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Bleckmann</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bei&#xdf;barth</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Comparative study on gene set and pathway topology-based enrichment methods</article-title>. <source>BMC Bioinforma.</source> <volume>16</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1186/s12859-015-0751-5</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beissbarth</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Speed</surname>
<given-names>T. P.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Gostat: find statistically overrepresented gene ontologies within a group of genes</article-title>. <source>Bioinformatics</source> <volume>20</volume>, <fpage>1464</fpage>&#x2013;<lpage>1465</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bth088</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bolstad</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Irizarry</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>&#xc5;strand</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Speed</surname>
<given-names>T. P.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>A comparison of normalization methods for high density oligonucleotide array data based on variance and bias</article-title>. <source>Bioinformatics</source> <volume>19</volume>, <fpage>185</fpage>&#x2013;<lpage>193</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/19.2.185</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chaytow</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.-T.</given-names>
</name>
<name>
<surname>Gillingwater</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Faller</surname>
<given-names>K. M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The role of survival motor neuron protein (smn) in protein homeostasis</article-title>. <source>Cell. Mol. Life Sci.</source> <volume>75</volume>, <fpage>3877</fpage>&#x2013;<lpage>3894</lpage>. <pub-id pub-id-type="doi">10.1007/s00018-018-2849-1</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Consortium</surname>
<given-names>G. O.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>The gene ontology (go) database and informatics resource</article-title>. <source>Nucleic acids Res.</source> <volume>32</volume>, <fpage>D258</fpage>&#x2013;<lpage>D261</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkh036</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Copland</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2008</year>). <source>Transcription profiling of human stage 1,2 normal and tumor kidney cancer</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-6344">https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-6344</ext-link>.</comment>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Davis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Meltzer</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Geoquery: a bridge between the gene expression omnibus (geo) and bioconductor</article-title>. <source>Bioinformatics</source> <volume>23</volume> (<issue>14</issue>), <fpage>1846</fpage>&#x2013;<lpage>1847</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btm254</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Draghici</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Khatri</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Tarca</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Amin</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Done</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Voichita</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>A systems biology approach for pathway level analysis</article-title>. <source>Genome Res.</source> <volume>17</volume>, <fpage>1537</fpage>&#x2013;<lpage>1545</lpage>. <pub-id pub-id-type="doi">10.1101/gr.6202607</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dykema</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Furge</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Renal cell carcinoma: hypoxia and endocytosis</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-14762">https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-14762</ext-link>.</comment>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fabregat</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sidiropoulos</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Garapati</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Gillespie</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hausmann</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Haw</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>The reactome pathway knowledgebase</article-title>. <source>Nucleic acids Res.</source> <volume>44</volume>, <fpage>D481</fpage>&#x2013;<lpage>D487</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv1351</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fitzgerald</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>McArdle</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>1941</year>). <article-title>Vitamins E and B6 in the treatment of muscular dystrophy and motor neurone disease</article-title>. <source>Brain</source> <volume>64</volume>, <fpage>19</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1093/brain/64.1.19</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Friesen</surname>
<given-names>W. J.</given-names>
</name>
<name>
<surname>Massenet</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Paushkin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wyce</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dreyfuss</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Smn, the product of the spinal muscular atrophy gene, binds preferentially to dimethylarginine-containing protein targets</article-title>. <source>Mol. Cell.</source> <volume>7</volume>, <fpage>1111</fpage>&#x2013;<lpage>1117</lpage>. <pub-id pub-id-type="doi">10.1016/s1097-2765(01)00244-1</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gillis</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pavlidis</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Assessing identity, redundancy and confounds in gene ontology annotations over time</article-title>. <source>Bioinformatics</source> <volume>29</volume>, <fpage>476</fpage>&#x2013;<lpage>482</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts727</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Glaab</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Baudot</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Krasnogor</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Valencia</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Topogsa: network topological gene set analysis</article-title>. <source>Bioinformatics</source> <volume>26</volume>, <fpage>1271</fpage>&#x2013;<lpage>1272</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btq131</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goeman</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Van De Geer</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>De Kort</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Van Houwelingen</surname>
<given-names>H. C.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>A global test for groups of genes: testing association with a clinical outcome</article-title>. <source>Bioinformatics</source> <volume>20</volume>, <fpage>93</fpage>&#x2013;<lpage>99</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btg382</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jones</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Otu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Spentzos</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kolia</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Inan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Beecken</surname>
<given-names>W. M.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>Gene signatures of progression and metastasis in renal cell cancer</article-title>. <source>Clinical cancer research</source> <volume>11</volume> (<issue>16</issue>), <fpage>5730</fpage>&#x2013;<lpage>5739</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hellmann</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Mooney</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Vitamin b6: a molecule for human health?</article-title> <source>Molecules</source> <volume>15</volume>, <fpage>442</fpage>&#x2013;<lpage>459</lpage>. <pub-id pub-id-type="doi">10.3390/molecules15010442</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hensel</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Cieri</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Santonicola</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Tapken</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Sch&#xfc;ning</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Taiana</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Impairment of the neurotrophic signaling hub b-raf contributes to motoneuron degeneration in spinal muscular atrophy</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>118</volume>, <fpage>e2007785118</fpage>. <pub-id pub-id-type="doi">10.1073/pnas.2007785118</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hensel</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Raker</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>F&#xf6;rthmann</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Buch</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sodeik</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Pich</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>The proteome and secretome of cortical brain cells infected with herpes simplex virus</article-title>. <source>Front. Neurol.</source> <volume>11</volume>, <fpage>844</fpage>. <pub-id pub-id-type="doi">10.3389/fneur.2020.00844</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hensel</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ratzka</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Brinkmann</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Klimaschewski</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Grothe</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Claus</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Analysis of the fibroblast growth factor system reveals alterations in a mouse model of spinal muscular atrophy</article-title>. <source>PLoS One</source> <volume>7</volume>, <fpage>e31202</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0031202</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hsieh-Li</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>J.-G.</given-names>
</name>
<name>
<surname>Jong</surname>
<given-names>Y.-J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>M.-H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>C. H.</given-names>
</name>
<etal/>
</person-group> (<year>2000</year>). <article-title>A mouse model for spinal muscular atrophy</article-title>. <source>Nat. Genet.</source> <volume>24</volume>, <fpage>66</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1038/71709</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hummel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Meister</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Mansmann</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>GlobalANCOVA: exploration and assessment of gene group effects</article-title>. <source>Bioinformatics</source> <volume>24</volume>, <fpage>78</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btm531</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>John Copland</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>von Roemeling</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tun</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2014</year>). <source>Gene array analysis of clear cell renal cell carcinoma tissue versus matched normal kidney tissue</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-53757">https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-53757</ext-link>.</comment>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jung</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Becker</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Brunner</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Bei&#xdf;barth</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Comparison of global tests for functional gene sets in two-group designs and selection of potentially effect-causing genes</article-title>. <source>Bioinformatics</source> <volume>27</volume>, <fpage>1377</fpage>&#x2013;<lpage>1383</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr152</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kanehisa</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Goto</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>KEGG: Kyoto encyclopedia of genes and genomes</article-title>. <source>Nucleic Acids Res.</source> <volume>28</volume>, <fpage>27</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1093/nar/28.1.27</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ko</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Mersha</surname>
<given-names>T. B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A roadmap for multi-omics data integration using deep learning</article-title>. <source>Briefings Bioinforma.</source> <volume>23</volume>, <fpage>bbab454</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbab454</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kelder</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Van Iersel</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Hanspers</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kutmon</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Conklin</surname>
<given-names>B. R.</given-names>
</name>
<name>
<surname>Evelo</surname>
<given-names>C. T.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>WikiPathways: building research communities on biological pathways</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume>, <fpage>D1301</fpage>&#x2013;<lpage>D1307</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkr1074</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>S.-Y.</given-names>
</name>
<name>
<surname>Volsky</surname>
<given-names>D. J.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>PAGE: parametric analysis of gene set enrichment</article-title>. <source>BMC Bioinforma.</source> <volume>6</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-6-144</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kolde</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Laur</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Adler</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Vilo</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Robust rank aggregation for gene list integration and meta-analysis</article-title>. <source>Bioinformatics</source> <volume>28</volume>, <fpage>573</fpage>&#x2013;<lpage>580</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr709</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kort</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2008</year>). <source>Microarray analysis of adult and childhood renal tumors</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-11024">https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-11024</ext-link>.</comment>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lefebvre</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>B&#xfc;rglen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Reboullet</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Clermont</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Burlet</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Viollet</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>1995</year>). <article-title>Identification and characterization of a spinal muscular atrophy-determining gene</article-title>. <source>Cell</source> <volume>80</volume>, <fpage>155</fpage>&#x2013;<lpage>165</lpage>. <pub-id pub-id-type="doi">10.1016/0092-8674(95)90460-3</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liberzon</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Subramanian</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pinchback</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Thorvaldsd&#xf3;ttir</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tamayo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mesirov</surname>
<given-names>J. P.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Molecular signatures database (MSigDB) 3.0</article-title>. <source>Bioinformatics</source> <volume>27</volume>, <fpage>1739</fpage>&#x2013;<lpage>1740</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr260</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Love</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Anders</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huber</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Differential analysis of count data&#x2013;the DESeq2 package</article-title>. <source>Genome Biol.</source> <volume>15</volume>, <fpage>550</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-014-0550-8</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Friedman</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Shedden</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hankenson</surname>
<given-names>K. D.</given-names>
</name>
<name>
<surname>Woolf</surname>
<given-names>P. J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>GAGE: generally applicable gene set enrichment for pathway analysis</article-title>. <source>BMC Bioinforma.</source> <volume>10</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-10-161</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mahajan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fiehn</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Barupal</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>IDSL.GOA: gene ontology analysis for interpreting metabolomic datasets</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>1299</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-51992-x</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Massa</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Chiogna</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Romualdi</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Gene set analysis exploiting the topology of a pathway</article-title>. <source>BMC Syst. Biol.</source> <volume>4</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1186/1752-0509-4-121</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Price</surname>
<given-names>P. L.</given-names>
</name>
<name>
<surname>Morderer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rossoll</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>RNP assembly defects in spinal muscular atrophy</article-title>. <source>RNA Metabolism Neurodegener. Dis.</source> <volume>20</volume>, <fpage>143</fpage>&#x2013;<lpage>171</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-89689-2_6</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rai</surname>
<given-names>S. N.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Steinbusch</surname>
<given-names>H. W.</given-names>
</name>
<name>
<surname>Vamanu</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ashraf</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>M. P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The role of vitamins in neurodegenerative disease: an update</article-title>. <source>Biomedicines</source> <volume>9</volume>, <fpage>1284</fpage>. <pub-id pub-id-type="doi">10.3390/biomedicines9101284</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="book">
<collab>R Core Team</collab> (<year>2022</year>). <source>R: a language and environment for statistical computing</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>R Foundation for Statistical Computing</publisher-name>.</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riessland</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ackermann</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>F&#xf6;rster</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jakubik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hauke</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Garbes</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>SAHA ameliorates the SMA phenotype in two mouse models for spinal muscular atrophy</article-title>. <source>Hum. Mol. Genet.</source> <volume>19</volume>, <fpage>1492</fpage>&#x2013;<lpage>1506</lpage>. <pub-id pub-id-type="doi">10.1093/hmg/ddq023</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ritchie</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Phipson</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Law</surname>
<given-names>C. W.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>limma powers differential expression analyses for RNA-sequencing and microarray studies</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume>, <fpage>e47</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv007</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schmid</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schmid</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>M&#xfc;ssel</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Str&#xe4;ng</surname>
<given-names>J. E.</given-names>
</name>
<name>
<surname>Buske</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bullinger</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>GiANT: gene set uncertainty in enrichment analysis</article-title>. <source>Bioinformatics</source> <volume>32</volume>, <fpage>1891</fpage>&#x2013;<lpage>1894</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btw030</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shannon</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Markiel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ozier</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Baliga</surname>
<given-names>N. S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Ramage</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2003</year>). <article-title>Cytoscape: a software environment for integrated models of biomolecular interaction networks</article-title>. <source>Genome Res.</source> <volume>13</volume>, <fpage>2498</fpage>&#x2013;<lpage>2504</lpage>. <pub-id pub-id-type="doi">10.1101/gr.1239303</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Signoretti</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Beroukhim</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2010</year>). <source>Patterns of gene expression and copy-number alterations in VHL disease-associated and sporadic ccRCC</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-14994">https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-GEOD-14994</ext-link>.</comment>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Subramanian</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tamayo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mootha</surname>
<given-names>V. K.</given-names>
</name>
<name>
<surname>Mukherjee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ebert</surname>
<given-names>B. L.</given-names>
</name>
<name>
<surname>Gillette</surname>
<given-names>M. A.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>Gene set enrichment analysis: a knowledge-based approach for interpreting genome-wide expression profiles</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>102</volume>, <fpage>15545</fpage>&#x2013;<lpage>15550</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0506580102</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Supek</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Bo&#x161;njak</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>&#x160;kunca</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>&#x160;muc</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>REVIGO summarizes and visualizes long lists of gene ontology terms</article-title>. <source>PLoS One</source> <volume>6</volume>, <fpage>e21800</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0021800</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>V&#xe4;likangas</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Suomi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Elo</surname>
<given-names>L. L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A comprehensive evaluation of popular proteomics software workflows for label-free proteome quantification and imputation</article-title>. <source>Briefings Bioinforma.</source> <volume>19</volume>, <fpage>1344</fpage>&#x2013;<lpage>1355</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbx054</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Mezlini</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Demir</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fiume</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Brudno</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Similarity network fusion for aggregating data types on a genomic scale</article-title>. <source>Nat. Methods</source> <volume>11</volume>, <fpage>333</fpage>&#x2013;<lpage>337</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.2810</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>clusterProfiler 4.0: a universal enrichment tool for interpreting omics data</article-title>. <source>Innovation</source> <volume>2</volume>, <fpage>100141</fpage>. <pub-id pub-id-type="doi">10.1016/j.xinn.2021.100141</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiong</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ancona</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hauser</surname>
<given-names>E. R.</given-names>
</name>
<name>
<surname>Mukherjee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Furey</surname>
<given-names>T. S.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Integrating genetic and gene expression evidence into genome-wide association analysis of gene sets</article-title>. <source>Genome Res.</source> <volume>22</volume>, <fpage>386</fpage>&#x2013;<lpage>397</lpage>. <pub-id pub-id-type="doi">10.1101/gr.124370.111</pub-id>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Goh</surname>
<given-names>W. W. B.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>How to do quantile normalization correctly for gene expression data analyses</article-title>. <source>Sci. Rep.</source> <volume>10</volume>, <fpage>15534</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-72664-6</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>