<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article article-type="methods-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1211819</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2023.1211819</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Bioinformatics</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Orthogonal outlier detection and dimension estimation for improved MDS embedding of biological datasets</article-title>
<alt-title alt-title-type="left-running-head">Li et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2023.1211819">10.3389/fbinf.2023.1211819</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Wanxin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2354273/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mirone</surname>
<given-names>Jules</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Prasad</surname>
<given-names>Ashok</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1393045/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Miolane</surname>
<given-names>Nina</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Legrand</surname>
<given-names>Carine</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2291969/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Dao Duc</surname>
<given-names>Khanh</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2243653/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Computer Science</institution>, <institution>University of British Columbia</institution>, <addr-line>Vancouver</addr-line>, <addr-line>BC</addr-line>, <country>Canada</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Mathematics</institution>, <institution>University of British Columbia</institution>, <addr-line>Vancouver</addr-line>, <addr-line>BC</addr-line>, <country>Canada</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Centre de Math&#xe9;matiques Appliqu&#xe9;es</institution>, <institution>Ecole Polytechnique</institution>, <addr-line>Palaiseau</addr-line>, <country>France</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Chemical and Biological Engineering</institution>, <institution>School of Biomedical Engineering</institution>, <institution>Colorado State University</institution>, <addr-line>Fort Collins</addr-line>, <addr-line>CO</addr-line>, <country>United States</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Department of Electrical and Computer Engineering</institution>, <institution>University of California, Santa Barbara</institution>, <addr-line>Santa Barbara</addr-line>, <addr-line>CA</addr-line>, <country>United States</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Universit&#x00E9; Paris Cit&#x00E9;, G&#x00E9;nomes, biologie cellulaire et th&#x00E9;rapeutique U944, INSERM, CNRS</institution>, <addr-line>Paris</addr-line>, <country>France</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1803995/overview">Martin Hemberg</ext-link>, Brigham and Women&#x2019;s Hospital and Harvard Medical School, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1057597/overview">Virginie Uhlmann</ext-link>, European Bioinformatics Institute (EMBL-EBI), United Kingdom</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2316382/overview">Nikolaos Patikas</ext-link>, Brigham and Women&#x2019;s Hospital and Harvard Medical School, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Carine Legrand, <email>carine.legrand@inserm.fr</email>; Khanh Dao Duc, <email>kdd@math.ubc.ca</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>10</day>
<month>08</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>3</volume>
<elocation-id>1211819</elocation-id>
<history>
<date date-type="received">
<day>03</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>07</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Li, Mirone, Prasad, Miolane, Legrand and Dao Duc.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Li, Mirone, Prasad, Miolane, Legrand and Dao Duc</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Conventional dimensionality reduction methods like Multidimensional Scaling (MDS) are sensitive to the presence of orthogonal outliers, leading to significant defects in the embedding. We introduce a robust MDS method, called <italic>DeCOr-MDS</italic> (Detection and Correction of Orthogonal outliers using MDS), based on the geometry and statistics of simplices formed by data points, that allows the detection of orthogonal outliers and the subsequent reduction of dimensionality. We validate our method using synthetic datasets, and further show how it can be applied to a variety of large real biological datasets, including cancer image cell data, human microbiome project data and single cell RNA sequencing data, to address the task of data cleaning and visualization.</p>
</abstract>
<kwd-group>
<kwd>orthogonal outliers</kwd>
<kwd>outlier detection</kwd>
<kwd>outlier correction</kwd>
<kwd>multidimensional scaling</kwd>
<kwd>shape data</kwd>
<kwd>microbiome data</kwd>
<kwd>scRNA seq</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Single Cell Bioinformatics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Multidimensional scaling (MDS) is a commonly used and fast method of data exploration and dimension reduction, with the unique capacity to take non-Euclidean dissimilarities as its input. However, its sensitivity to outliers is a major drawback (<xref ref-type="bibr" rid="B13">Harmeling et al., 2005</xref>; <xref ref-type="bibr" rid="B6">Blouvshtein and Cohen-Or, 2019</xref>). As arbitrary removal of outliers is undesirable, a possible alternative is to detect outliers and accommodate their influence on the MDS embedding, thus leveraging the information contained in outlying points.</p>
<p>Outlier detection has been widely used in biological data. Shieh and Hung proposed a method using principal component analysis (PCA) and robust estimation of Mahalanobis distances to detect outlier samples in microarray data (<xref ref-type="bibr" rid="B29">Shieh and Hung, 2009</xref>). Chen <italic>et al.</italic> reported the use of two PCA methods to uncover outlier samples in multiple simulated and real RNA-seq data (<xref ref-type="bibr" rid="B24">Oh et al., 2008</xref>). Outlier influence can be mitigated depending on the specific type of outlier. In-plane outliers and bad leverage points can be harnessed using <italic>&#x2113;</italic>
<sub>1</sub>-norm (<xref ref-type="bibr" rid="B32">Spence and Lewandowsky, 1989</xref>; <xref ref-type="bibr" rid="B9">Cayton and Dasgupta, 2006</xref>; <xref ref-type="bibr" rid="B12">Forero and Giannakis, 2012</xref>), correntropy or M-estimators (<xref ref-type="bibr" rid="B17">Mandanas and Kotropoulos, 2017</xref>). Outliers which violate the triangular inequality can be detected and corrected based on their pairwise distances (<xref ref-type="bibr" rid="B6">Blouvshtein and Cohen-Or, 2019</xref>). Orthogonal outliers are another particular case, where outliers have an important component, orthogonal to the hyperspace where most data is located. These outliers often do not violate the triangular inequality, and thus require an alternative approach.</p>
<p>Although MDS is known to be sensitive to such orthogonal outliers (<xref ref-type="bibr" rid="B31">Song et al., 2007</xref>; <xref ref-type="bibr" rid="B14">Legrand, 2017</xref>), none of the existing methods addresses this issue, to the best of our knowledge. We present here a robust MDS method, called <italic>DeCOr-MDS</italic>, Detection and Correction of Orthogonal outliers using MDS. <italic>DeCOr-MDS</italic> takes advantage of geometrical characteristics of the data to reduce the influence of orthogonal outliers, and estimate the dimension of the dataset. Our paper is organized as follows. We first describe the procedure and its implementation in detail. We then validate our method on synthetic data to confirm the accuracy and characterize the importance of different parts of our procedure. We further run the method on different experimental datasets from single cell images, microbiome sequencing data, and scRNA-seq data. Our experiments show that <italic>DeCOr-MDS</italic> can detect artefacts in cell shape data, improve the visualization of clusters in microbiome data, and be used as a step for quality control for scRNA-seq data, illustrating how it can be broadly applied to interpret and improve the performance of MDS on biological datasets. Finally, we discuss the advantages and limitations of our method and future directions.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Background: height and volume of n-simplices</title>
<p>We recall some geometric properties of simplices, which our method is based on. For a set of <italic>n</italic> points (<italic>x</italic>
<sub>1</sub>, &#x2026; , <italic>x</italic>
<sub>
<italic>n</italic>
</sub>), the associated <italic>n</italic>-simplex is the polytope of vertices (<italic>x</italic>
<sub>1</sub>, &#x2026; , <italic>x</italic>
<sub>
<italic>n</italic>
</sub>) (a 3-simplex is a triangle, a 4-simplex is a tetrahedron and so on). The height <italic>h</italic>(<italic>V</italic>
<sub>
<italic>n</italic>
</sub>, <italic>x</italic>) of a point <italic>x</italic> belonging to a <italic>n</italic>-simplex <italic>V</italic>
<sub>
<italic>n</italic>
</sub> can be obtained as (<xref ref-type="bibr" rid="B30">Sommerville, 1929</xref>)<disp-formula id="e1">
<mml:math id="m1">
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>where <italic>V</italic>
<sub>
<italic>n</italic>
</sub> is the volume of the <italic>n</italic>-simplex, and <italic>V</italic>
<sub>
<italic>n</italic>&#x2212;1</sub> is the volume of the (<italic>n</italic> &#x2212; 1)-simplex obtained by removing the point <italic>x</italic>. <italic>V</italic>
<sub>
<italic>n</italic>
</sub> and <italic>V</italic>
<sub>
<italic>n</italic>&#x2212;1</sub> can be computed using the pairwise distances only, with the Cayley-Menger formula (<xref ref-type="bibr" rid="B30">Sommerville, 1929</xref>):<disp-formula id="e2">
<mml:math id="m2">
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="italic">det</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>!</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>where <italic>det</italic>(<italic>CM</italic>
<sub>
<italic>n</italic>
</sub>) is the determinant of the Cayley-Menger matrix <italic>CM</italic>
<sub>
<italic>n</italic>
</sub>, that contains the pairwise distances <inline-formula id="inf1">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>, as<disp-formula id="e3">
<mml:math id="m4">
<mml:mi>C</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2,1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1,1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-2">
<title>2.2 Orthogonal outlier detection and dimensionality estimation</title>
<p>We now consider a dataset <bold>X</bold> of size <italic>N</italic> &#xd7; <italic>d</italic>, where <italic>N</italic> is the sample size and <italic>d</italic> the dimension of the data. We associate with <bold>X</bold> a matrix <bold>D</bold> of size <italic>N</italic> &#xd7; <italic>N</italic>, which represents all the pairwise distances between observations of <bold>X</bold>. We also assume that the data points can be mapped into a vector space with <italic>regular observations</italic> that form a <italic>main</italic> subspace of unknown dimension <italic>d</italic>&#x2a; with some small noise, and additional <italic>orthogonal outliers</italic> of relatively large orthogonal distance to the main subspace (<xref ref-type="fig" rid="F1">Figure 1A</xref>). Our proposed method aims to infer from <bold>D</bold> the dimension of the main data subspace <italic>d</italic>&#x2a;, using the geometric properties of simplices with respect to their number of vertices: Consider a (<italic>n</italic> &#x2b; 2)-simplex containing a data point <italic>x</italic>
<sub>
<italic>i</italic>
</sub> and its associated height, which can be computed using Eq. <xref ref-type="disp-formula" rid="e1">1</xref> in <xref ref-type="sec" rid="s2-1">Section 2.1</xref>. When <italic>n</italic> &#x3c; <italic>d</italic>&#x2a;, and for a large enough number <italic>S</italic> of sampled simplices, the distribution of heights obtained from different simplices containing <italic>x</italic>
<sub>
<italic>i</italic>
</sub> remains similar, whether <italic>x</italic>
<sub>
<italic>i</italic>
</sub> is an orthogonal outlier or a regular observation (see <xref ref-type="fig" rid="F1">Figure 1B</xref>). In contrast, when <italic>n</italic> &#x2265; <italic>d</italic>&#x2a;, the median of these heights approximately yields the distance of <italic>x</italic>
<sub>
<italic>i</italic>
</sub> to the main subspace (<xref ref-type="fig" rid="F1">Figure 1C</xref>). This distance should be significantly larger when <italic>x</italic>
<sub>
<italic>i</italic>
</sub> is an orthogonal outlier, compared with regular points, for which these distances are tantamount to the noise.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Example of a dataset with orthogonal outliers and n-simplices. <bold>(A)</bold> Representation of a dataset with regular data points (blue) belonging to a main subspace of dimension 2 with some noise, and orthogonal outliers (red triangle symbols) in the third dimension. <bold>(B)</bold> View of two instances of 3-simplices (triangles), one with only regular points (left) and the other one containing one outlier (right). The height drawn from the outlier is close to the height of the regular triangle. <bold>(C)</bold> Upon adding other regular points to obtain tetrahedrons (4-simplices), the height drawn from the outlier (right) becomes significantly larger than the height drawn from the same point (left) as in <bold>(B)</bold>.</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g001.tif"/>
</fig>
<p>To estimate <italic>d</italic>&#x2a; and for a given dimension <italic>n</italic> tested, we thus randomly sample, for every <italic>x</italic>
<sub>
<italic>i</italic>
</sub> in <bold>X</bold>, <italic>S</italic> (<italic>n</italic> &#x2b; 2)-simplices containing <italic>x</italic>
<sub>
<italic>i</italic>
</sub>, and compute the median of the heights <inline-formula id="inf2">
<mml:math id="m5">
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> associated with these <italic>S</italic> simplices. Upon considering, as a function of the dimension <italic>n</italic> tested, the distribution of median heights <inline-formula id="inf3">
<mml:math id="m6">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> (with 1 &#x2264; <italic>i</italic> &#x2264; <italic>N</italic>), we then identify <italic>d</italic>&#x2a; as the dimension at which this function presents a sharp transition towards a highly peaked distribution at zero. To do so, we compute <inline-formula id="inf4">
<mml:math id="m7">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, as the mean of <inline-formula id="inf5">
<mml:math id="m8">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and estimate <italic>d</italic>&#x2a; as<disp-formula id="e4">
<mml:math id="m9">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>argmax</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Furthermore, we detect orthogonal outliers using the distribution obtained in <inline-formula id="inf6">
<mml:math id="m10">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, as the points for which <inline-formula id="inf7">
<mml:math id="m11">
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> largely stands out from <inline-formula id="inf8">
<mml:math id="m12">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>. To do so, we compute <inline-formula id="inf9">
<mml:math id="m13">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> the standard deviation observed for the distribution <inline-formula id="inf10">
<mml:math id="m14">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and obtain the set of orthogonal outliers <bold>O</bold> as<disp-formula id="e5">
<mml:math id="m15">
<mml:mi mathvariant="bold">O</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mspace width="0.28em"/>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mspace width="0.28em"/>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3e;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>where <italic>c</italic> &#x3e; 0 is a parameter set to achieve a reasonable trade-off between outlier detection and false detection of noisy observations. Our implementation uses <italic>c</italic> &#x3d; 3 by default (following the three <italic>&#x3c3;</italic> rule <xref ref-type="bibr" rid="B25">Pukelsheim (1994)</xref>, and which corresponds to <inline-formula id="inf11">
<mml:math id="m16">
<mml:mo>&#x223c;</mml:mo>
<mml:mn>99.9</mml:mn>
<mml:mi>%</mml:mi>
</mml:math>
</inline-formula> of a Gaussian distribution being conserved), a value which was also used in our experiments. In case users possess prior information or want to control the fraction of detected outliers, the value of <italic>c</italic> may be modified, with increasing <italic>c</italic> making the detection stricter. Also note that our method introduces another parameter <italic>S</italic>, as it samples <italic>S</italic> simplices to calculate the median of the corresponding heights. Therefore, <italic>S</italic> should be large enough so that the resulting sample median approximates the global median well. Assuming that the heights are sampled from a continuous distribution, this can be guaranteed as the sample median is asymptotically normal, with mean equal to the true median and the standard deviation proportional to <inline-formula id="inf12">
<mml:math id="m17">
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B26">Rider, 1960</xref>).</p>
</sec>
<sec id="s2-3">
<title>2.3 Correcting the dimensionality estimation for a large outlier fraction</title>
<p>The method presented in the previous section assumes that at dimension <italic>d</italic>&#x2a;, the median height calculated for each point reflects the distance to the main subspace. This assumption is valid when the fraction of orthogonal outliers is small enough, so that the sampled <italic>n</italic>-simplex likely contains regular observations only, aside from the evaluated point. However, if the number of outliers gets large enough so that a significant fraction of <italic>n</italic>-simplices also contains outliers, then the calculated heights would yield the distance between <italic>x</italic>
<sub>
<italic>i</italic>
</sub> and an outlier-containing hyperplane, whose dimension is larger than that of a hyperplane containing only regular observations. The apparent dimensionality of the main subspace would thus increase and generate a positive bias on the estimate of <italic>d</italic>&#x2a;.</p>
<p>Specifically, if <bold>X</bold> contains a fraction of <italic>p</italic> outliers, and if we consider <italic>o</italic>
<sub>
<italic>n</italic>,<italic>p</italic>,<italic>N</italic>
</sub> the number of outliers drawn after uniformly sampling <italic>n</italic> &#x2b; 1 points (to test the dimension <italic>n</italic>), then <italic>o</italic>
<sub>
<italic>n</italic>,<italic>p</italic>,<italic>N</italic>
</sub> follows a hypergeometric law, with parameters <italic>n</italic> &#x2b; 1, the fraction of outliers <italic>p</italic> &#x3d; <italic>N</italic>
<sub>
<italic>o</italic>
</sub>/<italic>N</italic>, and <italic>N</italic>. Thus, the expected number of outliers drawn from a sampled simplex is (<italic>n</italic> &#x2b; 1) &#xd7; <italic>p</italic>. After estimating <inline-formula id="inf13">
<mml:math id="m18">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> (from <xref ref-type="sec" rid="s2-2">Section 2.2</xref>), and finding a proportion of outliers <inline-formula id="inf14">
<mml:math id="m19">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="bold">O</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>/</mml:mo>
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> using Eq. <xref ref-type="disp-formula" rid="e5">5</xref>, we hence correct <inline-formula id="inf15">
<mml:math id="m20">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> <italic>a posteriori</italic> by subtracting the estimated bias <italic>&#x3b4;</italic>, taken as the integer part of the expectation of <italic>o</italic>
<sub>
<italic>n</italic>,<italic>p</italic>,<italic>N</italic>
</sub>, so the debiased dimensionality estimate <italic>n</italic>&#x2a; is<disp-formula id="e6">
<mml:math id="m21">
<mml:msup>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x230b;</mml:mo>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-4">
<title>2.4 Outlier distance correction</title>
<p>Upon identifying the main subspace containing regular points, our procedure finally corrects the pairwise distances that contain outliers in the matrix <bold>D</bold>, in order to apply an MDS that projects the outliers in the main subspace. In the case where the original coordinates cannot be used (e.g., as a result of some transformation or if the distance is non-Euclidean), we perform the two following steps: 1) We first apply an MDS on <bold>D</bold> to place the points in a Euclidean space of dimension <italic>d</italic>, as a new matrix of coordinates <inline-formula id="inf16">
<mml:math id="m22">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. 2) We run a PCA on the full coordinates of the estimated set of regular data points (i.e., <inline-formula id="inf17">
<mml:math id="m23">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x5c;</mml:mo>
<mml:mi>O</mml:mi>
</mml:math>
</inline-formula>), and project the outliers along the first <inline-formula id="inf18">
<mml:math id="m24">
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> principal components of the PCA, since these components are sufficient to generate the main subspace. Using the projected outliers, we accordingly update the pairwise distances in <bold>D</bold> to obtain the corrected distance matrix <bold>D&#x2a;</bold>. Note that in the case where <bold>D</bold> derives from a Euclidean distance between the original coordinates, we can skip step 1), and directly run step 2) on the full coordinates of the estimated set of regular data points.</p>
</sec>
<sec id="s2-5">
<title>2.5 Overall procedure and implementation</title>
<p>The overall procedure, called DeCOr-MDS, is described in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>. The values for the parameters <italic>S</italic> and <italic>c</italic> were set by default and in our experiments to <italic>S</italic> &#x3d; 100 and <italic>c</italic> &#x3d; 3. We also provide an implementation in Python 3.8.10, available in this GitHub repository: <ext-link ext-link-type="uri" xlink:href="https://github.com/wxli0/DeCOr-MDS">https://github.com/wxli0/DeCOr-MDS</ext-link>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>DeCOr-MDS.<list list-type="simple">
<list-item>
<p>
<bold>Input</bold> <italic>D</italic> the pairwise distance matrix of the dataset of size <italic>N</italic> &#xd7; <italic>d</italic>, <italic>E</italic>
<sub>dim</sub> the set of dimensions <inline-formula id="inf19">
<mml:math id="m25">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> to be tested, <italic>c</italic> and <italic>S</italic> user-specified constants</p>
</list-item>
<list-item>
<p>
<bold>Output</bold> <inline-formula id="inf20">
<mml:math id="m26">
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> the relevant dimension of the dataset, <italic>O</italic> the list of orthogonal outliers, and <italic>D</italic>&#x2a; the matrix of corrected pairwise distances</p>
</list-item>
<list-item>
<p>
<bold>for</bold> <italic>n</italic> in <italic>E</italic>
<sub>
<italic>dim</italic>
</sub> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;<bold>for</bold> i in [1,N] <bold>do</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>for</bold> j in [1,S] <bold>do</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;Sample a (<italic>n</italic> &#x2b; 2)-simplex <italic>V</italic>
<sub>
<italic>i</italic>,<italic>j</italic>
</sub> containing <italic>x</italic>
<sub>
<italic>i</italic>
</sub>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;Compute the height (using <italic>D</italic> and Eq. <xref ref-type="disp-formula" rid="e1">1</xref>) <inline-formula id="inf21">
<mml:math id="m27">
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2254;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf22">
<mml:math id="m28">
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2254;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf23">
<mml:math id="m29">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2254;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf24">
<mml:math id="m30">
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2254;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf25">
<mml:math id="m31">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2254;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>arg max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf26">
<mml:math id="m32">
<mml:mi>O</mml:mi>
<mml:mo>&#x2254;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mspace width="0.28em"/>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mspace width="0.28em"/>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3e;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<italic>p</italic>&#x2254;&#x7c;<italic>O</italic>&#x7c;/<italic>N</italic>
</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf27">
<mml:math id="m33">
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x230b;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>(Skip if using original coordinates) Apply an MDS on <italic>D</italic> to create a Euclidean space of dimension <italic>d</italic>, resulting in <inline-formula id="inf28">
<mml:math id="m34">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>Apply a PCA on <inline-formula id="inf29">
<mml:math id="m35">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x5c;</mml:mo>
<mml:mi>O</mml:mi>
</mml:math>
</inline-formula> to get the main subspace of dimensionality <inline-formula id="inf30">
<mml:math id="m36">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<bold>for</bold> outlier <italic>i</italic> in <italic>O</italic> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;Project <italic>x</italic>
<sub>
<italic>i</italic>
</sub> on the main subspace, and correct the coordinates of <italic>x</italic>
<sub>
<italic>i</italic>
</sub> in <inline-formula id="inf31">
<mml:math id="m37">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>Recompute the pairwise distance matrix <italic>D</italic>&#x2a; from <inline-formula id="inf32">
<mml:math id="m38">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p>
<bold>return</bold> <inline-formula id="inf33">
<mml:math id="m39">
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, <italic>O</italic> and <italic>D</italic>&#x2a;</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>The complexity of the algorithm can be briefly evaluated as follows.<list list-type="simple">
<list-item>
<p>1. Given one <italic>n</italic>-simplex, the volume computation has a complexity of <inline-formula id="inf34">
<mml:math id="m40">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Since we compute the height for <italic>S</italic> simplices for each of the <italic>N</italic> data points, and repeat the process for all <italic>E</italic>
<sub>dim</sub> dimensions, the total complexity of this step amounts to <inline-formula id="inf35">
<mml:math id="m41">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>N</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>dim</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>2. The complexity of PCA over the regular data points is <inline-formula id="inf36">
<mml:math id="m42">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>3. The complexity of MDS over the pairwise distance matrix <italic>D</italic> is <inline-formula id="inf37">
<mml:math id="m43">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>4. The computation of the corrected distances matrix is <inline-formula id="inf38">
<mml:math id="m44">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
</list>
</p>
<p>Note that with the tested dimensions being smaller than the data dimension (<italic>n</italic>, <italic>E</italic>
<sub>dim</sub> &#x3c; <italic>d</italic>), and the number of simplices being significantly smaller than the total number of data points (<italic>S</italic> &#x226a; <italic>N</italic>), the burden of evaluating the simplices (step 1) and correcting outliers (step 4) is in practice less than the cost of the PCA (step 2) and MDS (step 3).</p>
</sec>
<sec id="s2-6">
<title>2.6 Datasets</title>
<sec id="s2-6-1">
<title>2.6.1 Synthetic datasets</title>
<p>The &#x201c;cross&#x201d; dataset (<xref ref-type="bibr" rid="B32">Spence and Lewandowsky, 1989</xref>), which is a two-dimensional dataset representing a simple cross structure (<xref ref-type="fig" rid="F2">Figure 2</xref>), was generated with <italic>N</italic> &#x3d; 25 points, and <italic>d</italic>&#x2a; &#x3d; 2. We introduced orthogonal outliers by randomly sampling three points and by adding a third coordinate of random amplitude to them. Other synthetic datasets were generated by sampling Gaussian-distributed coordinates in the main subspace, and adding some small noise in the whole space with variance between 0.0001 and 0.0003. A fraction <italic>p</italic> of the points was considered to define the orthogonal outliers, with coordinates modified by randomly increasing the coordinate(s) orthogonal to the plane; the amount increased is drawn from a uniform distribution between &#x2212;30 and 30 or between &#x2212;100 and 100. These datasets were generated for a main subspace of dimension 2, 10 and 40, with <italic>p</italic> &#x3d; 0.05 and <italic>N</italic> &#x3d; 200 for dimension <italic>d</italic>&#x2a; &#x3d; 2, <italic>p</italic> &#x3d; 0.05 and <italic>N</italic> &#x3d; 1,000 for dimension <italic>d</italic>&#x2a; &#x3d; 10, and <italic>p</italic> varying between 0.02 and 0.1 and <italic>N</italic> &#x3d; 1,000 for <italic>d</italic>&#x2a; &#x3d; 40. For all the synthetic datasets, the pairwise distance matrix was calculated using the Euclidean distance.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Application of DeCOr-MDS on a cross dataset. <bold>(A)</bold> Original cross dataset. The points selected to be orthogonal outliers are highlighted in red. <bold>(B)</bold> MDS embedding of the original data with an outlying component added to the selected points. <bold>(C)</bold> MDS embedding after preprocessing using DeCOr-MDS. Note that after correction, we recover the original cross structure.</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g002.tif"/>
</fig>
</sec>
<sec id="s2-6-2">
<title>2.6.2 Cell shape dataset</title>
<p>The cell shape dataset contains mouse osteosarcoma 2D imaged cells (<xref ref-type="bibr" rid="B2">Alizadeh et al., 2019</xref>), that were processed into a 100 &#xd7; 2 vector of coordinates that define the cell shape contour, used as a test dataset in the Python package Geomstats (<xref ref-type="bibr" rid="B22">Miolane et al., 2020</xref>) (for more details, see also (<xref ref-type="bibr" rid="B21">Miolane et al., 2021</xref>) and the associated Github link). We more specifically considered the subset of &#x201c;DUNN&#x201d; cells (that denotes a specific lineage) from the control group (no treatment on the cells), which yields 207 cells in total. The pairwise distance matrix of all cell shapes was obtained from the same reference (<xref ref-type="bibr" rid="B22">Miolane et al. (2020</xref>; <xref ref-type="bibr" rid="B21">2021)</xref>) using the so-called Square Root Velocity metric that derives from the <italic>L</italic>
<sub>2</sub> distance between velocities of the curves (<xref ref-type="bibr" rid="B33">Srivastava et al., 2010</xref>).</p>
</sec>
<sec id="s2-6-3">
<title>2.6.3 HMP dataset</title>
<p>The Human Microbiome Project (HMP) (<xref ref-type="bibr" rid="B36">Turnbaugh et al., 2007</xref>) dataset represents the microbiome measured across thousands of human subjects. The human microbiome corresponds to the set of microorganisms associated with the human body, including the gut flora, or the skin microbiota. The data used here corresponds to the HMP1 phase of clinical production. The hypervariable region v13 of ribosomal RNA was sequenced for each sample, which allowed the identification and counting of each specific microorganism, called phylotype. The processing and classification were performed by the HMP using MOTHUR, and made available as high quality counts (<ext-link ext-link-type="uri" xlink:href="https://www.hmpdacc.org/hmp/HMMCP/">https://www.hmpdacc.org/hmp/HMMCP/</ext-link>) (<xref ref-type="bibr" rid="B36">Turnbaugh et al., 2007</xref>). We downloaded this dataset, and subsequently, counts were filtered and normalized as previously described (<xref ref-type="bibr" rid="B14">Legrand, 2017</xref>). For our analysis, we also restricted our dataset to samples collected in nose and throat. Samples and phylotypes with fewer than 10 strictly positive counts were filtered out (<xref ref-type="bibr" rid="B14">Legrand, 2017</xref>), resulting in an <italic>n</italic> &#xd7; <italic>p</italic>-matrix where <italic>n</italic> &#x3d; 270 samples and <italic>p</italic> &#x3d; 425 phylotypes. Next, the data distribution was identified with an exponential distribution, by fitting its rate parameter. Normalization was then achieved by replacing the abundances (counts) with the corresponding quantiles. Lastly, the matrix of pairwise distances was obtained using the Euclidean distance.</p>
</sec>
<sec id="s2-6-4">
<title>2.6.4 scRNA-seq dataset</title>
<p>The scRNA-seq dataset contains single-cell transcriptomic profiles from mouse pancreatic cells (raw count data accession number: GSE84133), which were first processed using standard quality control methods from <xref ref-type="bibr" rid="B18">McCarthy et al. (2017)</xref>. From the gene count matrix, which originally contained 1,886 cells with 13,357 genes, we focused on the cells from Mouse 2, yielding 1,063 cells with 13,357 genes. We further log-normalized the data (<xref ref-type="bibr" rid="B16">Luecken and Theis, 2019</xref>) and selected highly variable genes using the <italic>scanpy</italic> package (<xref ref-type="bibr" rid="B37">Wolf et al., 2018</xref>). This procedure resulted in a normalized gene count matrix of 1,063 cells with 2,601 genes. To obtain a matrix of pairwise distances, we used the Euclidean distance.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 Using n-simplices for orthogonal outlier detection and dimensionality reduction</title>
<p>We propose a robust method to reduce and infer the dimensionality of a dataset from its pairwise distance matrix, by detecting and correcting orthogonal outliers. The method, called <italic>DeCOr-MDS</italic>, can be divided into three sub-procedures detailed in <xref ref-type="sec" rid="s2-2">Sections 2.2</xref>&#x2013;<xref ref-type="sec" rid="s2-4">2.4</xref>, with the overall algorithm provided in <xref ref-type="sec" rid="s2-5">Section 2.5</xref>. The first procedure detects orthogonal outliers and estimates the subspace dimension using the statistics of simplices that are sampled from the data, using Eqs <xref ref-type="disp-formula" rid="e4">4</xref>, <xref ref-type="disp-formula" rid="e5">5</xref>. The second procedure corrects for potential bias in the estimated dimension when the fraction of outliers is large. The third procedure corrects the pairwise distances of the original data, by replacing the distance to orthogonal outliers by that to their estimated projection on the main subspace. In the next sections, we report the results obtained upon running the procedure on synthetic and various biological datasets, that demonstrate the performance and accuracy of the method. For all these experiments, we also reported the runtime in <xref ref-type="sec" rid="s10">Supplementary Table S1</xref>, showing how the method can be used in practice with reasonable time on experimental datasets (less than 10&#xa0;min on our workstation, with x86_64 CPU, 132&#xa0;GB RAM and 447&#xa0;GB disk storage).</p>
</sec>
<sec id="s3-2">
<title>3.2 Performance on synthetic datasets</title>
<p>We first illustrate and evaluate the performance of the method on synthetic datasets (for a detailed description of the datasets and their generation, see the Methods <xref ref-type="sec" rid="s2-6">Section 2.6</xref>). On a simple dataset of points forming a 2D cross embedded in 3D (<xref ref-type="fig" rid="F2">Figure 2A</xref>), we observed that the MDS is sensitive to the presence of orthogonal outliers and distorts the cross when reducing the data in 2D (<xref ref-type="fig" rid="F2">Figure 2B</xref>). In contrast, our procedure recovers the original geometry of the uncontaminated dataset, with the outliers being correctly projected (<xref ref-type="fig" rid="F2">Figure 2C</xref>). The same results were obtained when sampling regular points from a 2D plane (<xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>). We further tested higher dimensions, and illustrate in <xref ref-type="fig" rid="F3">Figure 3A</xref> how the distribution of heights becomes concentrated around 0, when testing for the true dimension (<italic>d</italic>&#x2a; &#x3d; 10), as suggested in the Methods <xref ref-type="sec" rid="s2-2">Section 2.2</xref>. As a result, our method allows us to infer the main subspace dimension from Eq. <xref ref-type="disp-formula" rid="e4">4</xref>, as shown in <xref ref-type="fig" rid="F3">Figure 3B</xref>. In addition, the procedure accurately corrects the pairwise distances to orthogonal points with the distances to their projections on the main subspace, as shown in <xref ref-type="fig" rid="F3">Figure 3C</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Application of DeCOr-MDS on a synthetic dataset with a main subspace of dimension 10. <bold>(A)</bold> Distribution of median heights per data point <italic>x</italic>
<sub>
<italic>i</italic>
</sub> as a function of the tested dimension <italic>n</italic>. <bold>(B)</bold> Dimensionality inference based on the ratio of median heights (red curve, see also Eq. <xref ref-type="disp-formula" rid="e4">4</xref>), with the optimal ratio (black curve) found for the true dimension 10. <bold>(C)</bold> Shepard diagram comparing the pairwise distances between regular points and outliers that are projected to the main subspace (true <italic>&#x3b4;</italic>
<sub>
<italic>ij</italic>
</sub>), with the same distances obtained after directly running MDS on the original pairwise distance matrix (red dots), or after correcting these distances using our procedure (black dots).</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g003.tif"/>
</fig>
<p>When the dimension of the subspace and fraction of outliers get significantly large, we also illustrate the importance of the correction step (see Methods <xref ref-type="sec" rid="s2-3">Section 2.3</xref>), due to the sampling of simplices that contain several outliers. Upon using synthetic datasets with <italic>d</italic>&#x2a; &#x3d; 40 and varying the number (fraction) of outliers from 20 (2%) to 100 (10%), we observe this bias appearing before correction, with <italic>d</italic>&#x2a; being overestimated by 2 or 3 dimensions (<xref ref-type="fig" rid="F4">Figure 4</xref>). Using the debiased estimate <italic>n</italic>&#x2a; from Eq. <xref ref-type="disp-formula" rid="e6">6</xref> successfully reduced the bias, with an error <inline-formula id="inf39">
<mml:math id="m45">
<mml:mo>&#x2264;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula> for all the parameters tested.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Application of DeCOr-MDS on a dataset with a main subspace of dimension 40: Dimension correction effect versus the fraction of outliers. The vertical axis represents the remaining bias between the inferred and actual dimensions, before and after bias correction. After correction, the differences between the estimated dimensions and the true dimension are always closer to 0 regardless of the fraction of outliers.</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g004.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>3.3 Application to cell shape data</title>
<p>We further show how <italic>DeCOr-MDS</italic> can be broadly applied to biological data, ranging from images to high throughput sequencing. We first studied a dataset of single cell images, from osteosarcoma cells (see <xref ref-type="fig" rid="F5">Figure 5A</xref>), which were processed to extract from their contour a 100 &#xd7; 2 array of <italic>xy</italic> coordinates representing a discretization of a closed curve (see Dataset <xref ref-type="sec" rid="s2-6">Section 2.6</xref>). We obtained a pairwise distance matrix on this set of curves by using the so-called <italic>Square Root Velocity</italic> (SRV) metric, which defines a Euclidean distance on the space of velocities that derive from a regular parameterization of the curve (<xref ref-type="bibr" rid="B33">Srivastava et al., 2010</xref>; <xref ref-type="bibr" rid="B22">Miolane et al., 2020</xref>). Using <italic>DeCOr-MDS</italic>, we found a main subspace of dimension 2 (<xref ref-type="fig" rid="F5">Figure 5B</xref>), with 14 (7%) outliers detected among the 207 cells of this dataset. The comparison between the resulting embedding and that obtained from a simple MDS is shown in <xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>, and reveals that outliers, when uncorrected, affect the embedding coordinates, while our correction mitigates it. By examining in more detail the regular and inferred outlier cells (<xref ref-type="fig" rid="F5">Figure 5C</xref>, with all cell shapes shown in <xref ref-type="sec" rid="s10">Supplementary Figure S3</xref>), we found regular observations to approximately describe elliptic shapes, which is in agreement with the dimension found, since ellipses are defined by 2 parameters. One can also visually interpret the orthogonal outliers detected as being more irregular, with the presence of more spikes and small protrusions. 
Interestingly, the procedure also identified as outliers some images containing errors, due to bad cropping or segmentation (with 2&#xa0;cells shown instead of one), which should thus be removed from the dataset for downstream analysis.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Application of DeCOr-MDS on a cell shapes dataset. <bold>(A)</bold> An example of an osteosarcoma cell image obtained from fluorescence microscopy. We process and extract the cell contour in our analysis. <bold>(B)</bold> Dimensionality inference of the dataset obtained from 207 cell shapes using DeCOr-MDS. We estimate the dimension of the main subspace <italic>n</italic>&#x2a; &#x3d; 2. The red line shows the median of heights and the black curve shows the heights median ratios between <italic>h</italic>
<sub>
<italic>n</italic>&#x2212;1</sub> and <italic>h</italic>
<sub>
<italic>n</italic>
</sub>. <bold>(C)</bold> Examples of cell shapes, including regular cells (in black), and orthogonal outliers detected. Among these outliers, we highlight cell shapes that are likely to be invalid due to segmentation errors (in blue), with the other outliers shown in red.</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g005.tif"/>
</fig>
</sec>
<sec id="s3-4">
<title>3.4 Application to HMP data</title>
<p>As another example of application to biological data, we next considered a dataset from the Human Microbiome Project (HMP). The Human Microbiome Project aims at describing and studying the microbial contribution to the human body. In particular, genes contributed by microbes in the gut are of primary importance in health and disease (<xref ref-type="bibr" rid="B36">Turnbaugh et al., 2007</xref>). The resulting data is an array which typically contains the abundance of different elements of the microbiome (typically 10<sup>2</sup>&#x2013;10<sup>3</sup>), denoted phylotypes, measured in different human subjects. To analyze such high dimensional datasets, dimensionality reduction methods including MDS (often denoted Principal Coordinates Analysis PCoA), are typically applied and used to visualize the data (<xref ref-type="bibr" rid="B7">Brooks et al., 2018</xref>; <xref ref-type="bibr" rid="B35">Trevelline and Kohl, 2022</xref>; <xref ref-type="bibr" rid="B38">Zhou et al., 2022</xref>).</p>
<p>To assess our method incrementally, we first restricted the analysis to a representative specific site (nose), yielding a 136 &#xd7; 425 array that was further normalized to generate Euclidean pairwise distance matrices (see Material and Methods <xref ref-type="sec" rid="s2-6">Section 2.6</xref> for more details). Upon running <italic>DeCOr-MDS</italic>, we estimated the main dimension to be 3, with 9 (6.62%) orthogonal outliers detected, as shown in <xref ref-type="fig" rid="F6">Figure 6A</xref>. This is consistent with another study, which estimated the dimension of the HMP dataset to be 2 or 3 (<xref ref-type="bibr" rid="B34">Tomassi et al., 2021</xref>). We also computed the average distance between these orthogonal outliers and the barycenter of regular points in the reduced subspace, and obtained a decrease from 1.21 when using <italic>MDS</italic> to 0.91 when using <italic>DeCOr-MDS</italic>. This decrease suggests that orthogonal outliers get corrected and projected closer to the regular points, to improve the visualization of the data in the reduced subspace, like in our experiments with the synthetic datasets (<xref ref-type="fig" rid="F2">Figure 2</xref> and <xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>). In <xref ref-type="fig" rid="F6">Figure 6B</xref>, we next aggregated data points from another site (throat) to study how the method performs in this case, yielding a 270 &#xd7; 425 array that was further normalized to generate Euclidean pairwise distance matrices. As augmenting the dataset brings a separate cluster of data points, the dimension of the main dataset was then estimated to be 2, with 13 (5%) orthogonal outliers detected, as shown in <xref ref-type="fig" rid="F6">Figure 6B</xref>. 
The average distance between the projected outliers and the barycenter of projected regular points is approximately the same when using <italic>MDS</italic> (1.46) as when using <italic>DeCOr-MDS</italic> (1.45) for nose, and is also approximately the same when using <italic>MDS</italic> (1.75) as when using <italic>DeCOr-MDS</italic> (1.74) for throat. This slight decrease also suggests that orthogonal outliers get corrected and projected closer to the regular points.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Application of DeCOr-MDS on HMP dataset. <bold>(A)</bold>: Structure restituted on 3 axes using <italic>MDS</italic> (left) and our procedure (right) using data from the nose site. The points marked with cross represent orthogonal outliers detected by <italic>DeCOr-MDS</italic>, which are also put closer to regular points after correction. <bold>(B)</bold> Same comparison as in <bold>(A)</bold> using data from nose and throat. The two clusters formed by nose and throat have a better separation using DeCOr-MDS.</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g006.tif"/>
</fig>
</sec>
<sec id="s3-5">
<title>3.5 Application to scRNA-seq data</title>
<p>We further evaluated <italic>DeCOr-MDS</italic> on single cell RNA-seq (scRNA-seq) data. In general, analyzing scRNA-seq data requires dimensionality reduction for visualization (including MDS-based methods <xref ref-type="bibr" rid="B8">Canzar et al. (2021)</xref>; <xref ref-type="bibr" rid="B28">Senabouth et al. (2019)</xref>), and specific quality control procedures to mitigate various technical artifacts <xref ref-type="bibr" rid="B18">McCarthy et al. (2017)</xref>; <xref ref-type="bibr" rid="B16">Luecken and Theis (2019)</xref>. We applied our method as a potentially relevant tool for this purpose. We applied <italic>DeCOr-MDS</italic> first on a dataset containing the expression level of 1,063 cells for 2,601 genes (detailed in Methods <xref ref-type="sec" rid="s2-6">Section 2.6</xref>). We found the dimension of the main subspace to be 3, with 77 (7%) outliers detected. In <xref ref-type="fig" rid="F7">Figures 7A,B</xref>, we compared the embeddings in 3D using <italic>MDS</italic> and <italic>DeCOr-MDS</italic>. Similarly to the previous experiments, the mean distance between the orthogonal outliers and barycenter of regular points in the reduced subspace decreases when using <italic>DeCOr-MDS</italic> (from 4.22 to 2.51), improving the visualization of regular points. In <xref ref-type="fig" rid="F7">Figure 7C</xref>, we further examined the drop-out rates (indicating zero count for a given gene) of the cells among the top 500 highly expressed genes, determined by the median of counts per gene. Among these highly expressed genes, we identified 97.4% of the detected outliers that have drop-out rates greater than 0.95, while this was the case for 27.4% of the regular cells. 
Upon performing a pairwise <italic>t</italic>-test on the total counts for the top 500 highly expressed genes from the outlier group and the regular cell group, we found that the total counts are significantly different between the two groups (<italic>p</italic>-value <inline-formula id="inf40">
<mml:math id="m46">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.001</mml:mn>
</mml:math>
</inline-formula>). Therefore, our method detected some outliers associated with high drop-out counts for highly expressed genes, which were not captured by the standard processing and quality control methods used in the first place.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Application of <italic>DeCOr-MDS</italic> on the scRNA-seq dataset. Structure restituted on 3 axes using <italic>MDS</italic> <bold>(A)</bold>, and using <italic>DeCOr-MDS</italic> <bold>(B)</bold>. The red points in <bold>(A)</bold> and <bold>(B)</bold> are outliers detected by <italic>DeCOr-MDS</italic>. The blue points are regular points. <bold>(C)</bold> Violin plot for the drop-out rates for the scRNA-seq dataset, for the top 500 highly expressed genes. Drop-out rates for outliers detected by <italic>DeCOr-MDS</italic> are shown in red.</p>
</caption>
<graphic xlink:href="fbinf-03-1211819-g007.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>We proposed <italic>DeCOr-MDS</italic>, a novel approach using geometric characteristics to detect dimension, and to correct orthogonal outliers in high dimensional space. This is, to the best of our knowledge, the first statistical tool that addresses the challenge of the presence of orthogonal outliers in high dimensional space. We validated the method using synthetic datasets and demonstrated its potential applications to analyze biological datasets, including cell shape data, count arrays from microbiome data and scRNA-seq data. The visualization and numerical comparison confirmed that <italic>DeCOr-MDS</italic> effectively detects dimensionalities in many instances, corrects orthogonal outliers, and demonstrates superior performance to classical dimension reduction methods.</p>
<p>The notion of simplices is used frequently with the aim of robustness, either to detect the coreness of data [data depth and multivariate median, <xref ref-type="bibr" rid="B15">Liu (1990)</xref>; <xref ref-type="bibr" rid="B1">Aamari et al. (2021)</xref>], or to detect outlying features [detection of extreme directions, <xref ref-type="bibr" rid="B20">Meyer and Wintenberger (2021)</xref>]. Simplices can also be used to build a flexible network of points for informative visualization (<xref ref-type="bibr" rid="B19">McInnes et al., 2018</xref>). Outlier detection and accommodation have been addressed by a wide array of methods, which can be broadly divided into three categories: 1) robust metrics (<xref ref-type="bibr" rid="B32">Spence and Lewandowsky, 1989</xref>; <xref ref-type="bibr" rid="B9">Cayton and Dasgupta, 2006</xref>; <xref ref-type="bibr" rid="B24">Oh et al., 2008</xref>; <xref ref-type="bibr" rid="B29">Shieh and Hung, 2009</xref>; <xref ref-type="bibr" rid="B12">Forero and Giannakis, 2012</xref>), 2) robust estimation (<xref ref-type="bibr" rid="B17">Mandanas and Kotropoulos, 2017</xref>), or 3) exploiting the characteristics of outliers (<xref ref-type="bibr" rid="B12">Forero and Giannakis, 2012</xref>; <xref ref-type="bibr" rid="B6">Blouvshtein and Cohen-Or, 2019</xref>). Our method resorts to both 3) by using the geometry of data, and 1) by using the median as centrality estimator. Our method also aims at estimating dimension. A common approach to do so is the screeplot (or elbow) test in principal components analysis, where a notable drop in the proportion of variance (or distance) explained can be taken as a cutoff, and as the most relevant dimension. High-dimensional biological datasets challenge this strategy, because fine-scale structure confounds in practice downstream analyses. 
Because of this, authors often use an arbitrary large set of 10, or sometimes 20 or 50 components (<xref ref-type="bibr" rid="B4">Astle and Balding, 2009</xref>; <xref ref-type="bibr" rid="B5">Barfield et al., 2014</xref>; <xref ref-type="bibr" rid="B10">Demmitt et al., 2017</xref>; <xref ref-type="bibr" rid="B27">Sakaue et al., 2020</xref>; <xref ref-type="bibr" rid="B3">Arciero et al., 2021</xref>; <xref ref-type="bibr" rid="B11">Deng et al., 2021</xref>). Power analyses based on simulations also provide a way to assess an adequate number of components (<xref ref-type="bibr" rid="B5">Barfield et al., 2014</xref>). In this work, we proposed an alternative approach, by exploiting the structure of the dataset to determine essential versus non-essential dimensions.</p>
<p>Limitations of DeCOr-MDS include the non-automated choice of the cutoff parameter <italic>c</italic>. This parameter sets the maximum tolerated number of standard deviations <italic>&#x3c3;</italic> before a point is considered an outlier. A value of <italic>c</italic> &#x3d; 3, which corresponds approximately to the 0.1% most extreme points in a Gaussian distribution, may be selected, for instance. Dimension detection is also imperfect for heterogeneous datasets where the distribution of regular points (e.g. with distant clusters) may prevent the height criterion for outlier detection from being effective. In this case a possible solution would be to first perform a clustering analysis (for instance k-means) to assess if the distance between clusters is comparable with the distance between the outlier and the main subspace, and if that&#x2019;s the case separately perform our method on each cluster. There are various potential directions to improve the dimension detection in real datasets of high dimension. This may be achieved by studying the behaviour of the Cayley-Menger determinant, which is central in the procedure, in higher dimensions. One may also associate the height criterion with a distribution criterion (<xref ref-type="bibr" rid="B14">Legrand, 2017</xref>), which would be sensitive to clusters or other notable structure, as was apparent in the HMP dataset. Another beneficial improvement would be to reduce computing time, for instance by implementing a parallelized version or using a call to a compiled program. Finally, one could optimize the cutoff parameter <italic>c</italic> automatically, either through a hyperparameter search, or by using a data-driven procedure, during the exploration phase of the algorithm.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://osf.io/x5796">https://osf.io/x5796</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>Conceptualization: CL and KDD, data curation: WL, JM, NM, AP, CL, and KDD, funding acquisition: CL and KDD, formal analysis: JM, CL, and KDD, investigation: WL and JM, methodology: CL and KDD, resources: AP, CL, and KDD, software: WL, JM, NM, CL, and KDD, supervision: CL and KDD, validation: WL, visualization: WL, JM, CL, and KDD, writing&#x2013;original draft: WL, JM, CL, and KDD, writing&#x2013;review and editing: WL, CL, and KDD. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This research was supported by an NSERC Discovery grant (PG 22R3468) and a MITACS PIMS fellowship.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fbinf.2023.1211819/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fbinf.2023.1211819/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Aamari</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Arias-Castro</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Berenfeld</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>From graph centrality to data depth</article-title>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2105.03122">https://arxiv.org/abs/2105.03122</ext-link>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alizadeh</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Castle</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Foss</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Prasad</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>TISMorph: A tool to quantify texture, irregularity and spreading of single cells</article-title>. <source>PLoS One</source> <volume>14</volume>, <fpage>e0217346</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0217346</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arciero</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Dogra</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Malawsky</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Mezzavilla</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tsismentzoglou</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Q. Q.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Fine-scale population structure and demographic history of British Pakistanis</article-title>. <source>Nat. Commun.</source> <volume>12</volume> (<issue>1</issue>), <fpage>7189</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-27394-2</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Astle</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Balding</surname>
<given-names>D. J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Population structure and cryptic relatedness in genetic association studies</article-title>. <source>Stat. Sci.</source> <volume>24</volume>, <fpage>451</fpage>&#x2013;<lpage>471</lpage>. <pub-id pub-id-type="doi">10.1214/09-sts307</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barfield</surname>
<given-names>R. T.</given-names>
</name>
<name>
<surname>Almli</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Kilaru</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Mercer</surname>
<given-names>K. B.</given-names>
</name>
<name>
<surname>Duncan</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Accounting for population stratification in DNA methylation studies</article-title>. <source>Genet. Epidemiol.</source> <volume>38</volume>, <fpage>231</fpage>&#x2013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1002/gepi.21789</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Blouvshtein</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Cohen-Or</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Outlier detection for robust multi dimensional scaling</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>41</volume>, <fpage>2273</fpage>&#x2013;<lpage>2279</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2018.2851513</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brooks</surname>
<given-names>A. W.</given-names>
</name>
<name>
<surname>Priya</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Blekhman</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bordenstein</surname>
<given-names>S. R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Gut microbiota diversity across ethnicities in the United States</article-title>. <source>PLoS Biol.</source> <volume>16</volume>, <fpage>e2006842</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pbio.2006842</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Canzar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Do</surname>
<given-names>V. H.</given-names>
</name>
<name>
<surname>Jeli&#x107;</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Laue</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Matijevi&#x107;</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Prusina</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Metric multidimensional scaling for large single-cell data sets using neural networks</article-title>. <source>bioRxiv</source>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="other">
<person-group person-group-type="author">
<name>
<surname>Cayton</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Dasgupta</surname>
<given-names>S.</given-names>
</name>
</person-group> &#x201c;<article-title>Robust Euclidean embedding</article-title>,&#x201d; in <conf-name>Proceedings of the 23rd International Conference on Machine Learning</conf-name>. Editors <person-group person-group-type="editor">
<name>
<surname>Cohen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>A.</given-names>
</name>
</person-group>, <fpage>169</fpage>&#x2013;<lpage>176</lpage>. <conf-date>June 2006</conf-date>, <conf-loc>Pittsburgh, Pennsylvania, USA</conf-loc>, <pub-id pub-id-type="doi">10.1145/1143844.1143866</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Demmitt</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Corley</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Huibregtse</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Keller</surname>
<given-names>M. C.</given-names>
</name>
<name>
<surname>Hewitt</surname>
<given-names>J. K.</given-names>
</name>
<name>
<surname>McQueen</surname>
<given-names>M. B.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Genetic influences on the human oral microbiome</article-title>. <source>BMC Genomics</source> <volume>18</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1186/s12864-017-4008-8</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Caddell</surname>
<given-names>D. F.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Dahlen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Washington</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Genome wide association study reveals plant loci controlling heritability of the rhizosphere microbiome</article-title>. <source>ISME J.</source> <volume>15</volume>, <fpage>3181</fpage>&#x2013;<lpage>3194</lpage>. <pub-id pub-id-type="doi">10.1038/s41396-021-00993-z</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Forero</surname>
<given-names>P. A.</given-names>
</name>
<name>
<surname>Giannakis</surname>
<given-names>G. B.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Sparsity-exploiting robust multidimensional scaling</article-title>. <source>IEEE Trans. Signal Process.</source> <volume>60</volume>, <fpage>4118</fpage>&#x2013;<lpage>4134</lpage>. <pub-id pub-id-type="doi">10.1109/tsp.2012.2197617</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Harmeling</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dornhege</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Tax</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Meinecke</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>K.-R.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>From outliers to prototypes: Ordering data</article-title>. <source>Neurocomputing</source> <volume>69</volume>, <fpage>1608</fpage>&#x2013;<lpage>1618</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2005.05.015</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Legrand</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Exploring and controlling for underlying structure in genome and microbiome case-control association studies</article-title>, <comment>Ph.D. thesis</comment>. <publisher-loc>Heidelberg, Germany</publisher-loc>: <publisher-name>University of Heidelberg</publisher-name>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>R. Y.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>On a notion of data depth based on random simplices</article-title>. <source>Ann. Statistics</source> <volume>18</volume>, <fpage>405</fpage>&#x2013;<lpage>414</lpage>. <pub-id pub-id-type="doi">10.1214/aos/1176347507</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luecken</surname>
<given-names>M. D.</given-names>
</name>
<name>
<surname>Theis</surname>
<given-names>F. J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Current best practices in single-cell RNA-seq analysis: A tutorial</article-title>. <source>Mol. Syst. Biol.</source> <volume>15</volume>, <fpage>e8746</fpage>. <pub-id pub-id-type="doi">10.15252/msb.20188746</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mandanas</surname>
<given-names>F. D.</given-names>
</name>
<name>
<surname>Kotropoulos</surname>
<given-names>C. L.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Robust multidimensional scaling using a maximum correntropy criterion</article-title>. <source>IEEE Trans. Signal Process.</source> <volume>65</volume>, <fpage>919</fpage>&#x2013;<lpage>932</lpage>. <pub-id pub-id-type="doi">10.1109/tsp.2016.2625265</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McCarthy</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Campbell</surname>
<given-names>K. R.</given-names>
</name>
<name>
<surname>Lun</surname>
<given-names>A. T.</given-names>
</name>
<name>
<surname>Wills</surname>
<given-names>Q. F.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Scater: Pre-processing, quality control, normalization and visualization of single-cell RNA-seq data in R</article-title>. <source>Bioinformatics</source> <volume>33</volume>, <fpage>1179</fpage>&#x2013;<lpage>1186</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btw777</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>McInnes</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Healy</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Melville</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>UMAP: Uniform manifold approximation and projection for dimension reduction</article-title>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1802.03426">https://arxiv.org/abs/1802.03426</ext-link>.</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meyer</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wintenberger</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Sparse regular variation</article-title>. <source>Adv. Appl. Probab.</source> <volume>53</volume>, <fpage>1115</fpage>&#x2013;<lpage>1148</lpage>. <pub-id pub-id-type="doi">10.1017/apr.2021.14</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Miolane</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Caorsi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lupo</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Guerard</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Guigui</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mathe</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>ICLR 2021 challenge for computational geometry &#x26; topology: Design and results</article-title>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2108.09810">https://arxiv.org/abs/2108.09810</ext-link>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miolane</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Guigui</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Le Brigant</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mathe</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Thanwerdas</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Geomstats: A Python package for Riemannian geometry in machine learning</article-title>. <source>J. Mach. Learn. Res.</source> <volume>21</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Oh</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rosenblatt</surname>
<given-names>K.</given-names>
</name>
</person-group> &#x201c;<article-title>Biological data outlier detection based on Kullback-Leibler divergence</article-title>,&#x201d; in <conf-name>Proceedings of the 2008 IEEE International Conference on Bioinformatics and Biomedicine (IEEE)</conf-name>, <conf-loc>Philadelphia, PA, USA</conf-loc>, <conf-date>November 2008</conf-date>, <fpage>249</fpage>&#x2013;<lpage>254</lpage>.</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pukelsheim</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>The three sigma rule</article-title>. <source>Am. Statistician</source> <volume>48</volume>, <fpage>88</fpage>&#x2013;<lpage>91</lpage>. <pub-id pub-id-type="doi">10.2307/2684253</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rider</surname>
<given-names>P. R.</given-names>
</name>
</person-group> (<year>1960</year>). <article-title>Variance of the median of small samples from several special populations</article-title>. <source>J. Am. Stat. Assoc.</source> <volume>55</volume>, <fpage>148</fpage>&#x2013;<lpage>150</lpage>. <pub-id pub-id-type="doi">10.1080/01621459.1960.10482056</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sakaue</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hirata</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kanai</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Suzuki</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Akiyama</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lai Too</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Dimensionality reduction reveals fine-scale structure in the Japanese population with consequences for polygenic risk prediction</article-title>. <source>Nat. Commun.</source> <volume>11</volume>, <fpage>1569</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-020-15194-z</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Senabouth</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lukowski</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>Hernandez</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Andersen</surname>
<given-names>S. B.</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>Q. H.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>ascend: R package for analysis of single-cell RNA-seq data</article-title>. <source>GigaScience</source> <volume>8</volume>, <fpage>giz087</fpage>. <pub-id pub-id-type="doi">10.1093/gigascience/giz087</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shieh</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Hung</surname>
<given-names>Y. S.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Detecting outlier samples in microarray data</article-title>. <source>Stat. Appl. Genet. Mol. Biol.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.2202/1544-6115.1426</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sommerville</surname>
<given-names>D. M. Y.</given-names>
</name>
</person-group> (<year>1929</year>). <source>An introduction to the geometry of n dimensions</source>. <publisher-loc>Mineola, New York, United States</publisher-loc>: <publisher-name>Dover Publications</publisher-name>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jermaine</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ranka</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Conditional anomaly detection</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>19</volume>, <fpage>631</fpage>&#x2013;<lpage>645</lpage>. <pub-id pub-id-type="doi">10.1109/tkde.2007.1009</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Spence</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Lewandowsky</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>Robust multidimensional scaling</article-title>. <source>Psychometrika</source> <volume>54</volume>, <fpage>501</fpage>&#x2013;<lpage>513</lpage>. <pub-id pub-id-type="doi">10.1007/bf02294632</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Srivastava</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Klassen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Joshi</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Jermyn</surname>
<given-names>I. H.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Shape analysis of elastic curves in Euclidean spaces</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>33</volume>, <fpage>1415</fpage>&#x2013;<lpage>1428</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2010.184</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tomassi</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Forzani</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pfeiffer</surname>
<given-names>R. M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Sufficient dimension reduction for compositional data</article-title>. <source>Biostatistics</source> <volume>22</volume>, <fpage>687</fpage>&#x2013;<lpage>705</lpage>. <pub-id pub-id-type="doi">10.1093/biostatistics/kxz060</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trevelline</surname>
<given-names>B. K.</given-names>
</name>
<name>
<surname>Kohl</surname>
<given-names>K. D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>The gut microbiome influences host diet selection behavior</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>119</volume>, <fpage>e2117537119</fpage>. <pub-id pub-id-type="doi">10.1073/pnas.2117537119</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Turnbaugh</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Ley</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Hamady</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fraser-Liggett</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Knight</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gordon</surname>
<given-names>J. I.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>The human microbiome project</article-title>. <source>Nature</source> <volume>449</volume>, <fpage>804</fpage>&#x2013;<lpage>810</lpage>. <pub-id pub-id-type="doi">10.1038/nature06244</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wolf</surname>
<given-names>F. A.</given-names>
</name>
<name>
<surname>Angerer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Theis</surname>
<given-names>F. J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Scanpy: Large-scale single-cell gene expression data analysis</article-title>. <source>Genome Biol.</source> <volume>19</volume>, <fpage>15</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-017-1382-0</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Gut microbiome mediates the protective effects of exercise after myocardial infarction</article-title>. <source>Microbiome</source> <volume>10</volume>, <fpage>82</fpage>. <pub-id pub-id-type="doi">10.1186/s40168-022-01271-6</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>