<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Physiol.</journal-id>
<journal-title>Frontiers in Physiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Physiol.</abbrev-journal-title>
<issn pub-type="epub">1664-042X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1233341</article-id>
<article-id pub-id-type="doi">10.3389/fphys.2023.1233341</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Outlier detection using iterative adaptive mini-minimum spanning tree generation with applications on medical data</article-title>
<alt-title alt-title-type="left-running-head">Li et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphys.2023.1233341">10.3389/fphys.2023.1233341</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Jia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2333515/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Jiangwei</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Chenxu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Verbeek</surname>
<given-names>Fons J.</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1220799/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Schultz</surname>
<given-names>Tanja</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/748491/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Liu</surname>
<given-names>Hui</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1257413/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Software Engineering</institution>, <institution>Xi&#x2019;an Jiaotong University</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Leiden Institute of Advanced Computer Science</institution>, <institution>Leiden University</institution>, <addr-line>Leiden</addr-line>, <country>Netherlands</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Geriatric Surgery</institution>, <institution>The Second Affiliated Hospital of Xi&#x2019;an Jiaotong University</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>MOE Key Lab of Intelligent Network and Network Security</institution>, <institution>Xi&#x2019;an Jiaotong University</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Cognitive Systems Lab</institution>, <institution>University of Bremen</institution>, <addr-line>Bremen</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1096669/overview">Michael D&#xf6;llinger</ext-link>, University Hospital Erlangen, Germany</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1210933/overview">Stefan Schoder</ext-link>, Graz University of Technology, Austria</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1023498/overview">Eamonn John Keogh</ext-link>, University of California, Riverside, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Fons J. Verbeek, <email>f.j.verbeek@liacs.leidenuniv.nl</email>; Hui Liu, <email>hui.liu@uni-bremen.de</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>10</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1233341</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>06</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>09</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Li, Li, Wang, Verbeek, Schultz and Liu.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Li, Li, Wang, Verbeek, Schultz and Liu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>As an important technique for data pre-processing, outlier detection plays a crucial role in various real applications and has gained substantial attention, especially in medical fields. Despite the importance of outlier detection, many existing methods are vulnerable to the distribution of outliers and require prior knowledge, such as the outlier proportion. To address this problem to some extent, this article proposes an adaptive mini-minimum spanning tree-based outlier detection (MMOD) method, which utilizes a novel distance measure by scaling the Euclidean distance. For datasets containing different densities and taking on different shapes, our method can identify outliers without prior knowledge of outlier percentages. The results on both real-world medical data corpora and intuitive synthetic datasets demonstrate the effectiveness of the proposed method compared to state-of-the-art methods.</p>
</abstract>
<kwd-group>
<kwd>minimum spanning tree</kwd>
<kwd>outlier detection</kwd>
<kwd>cluster-based outlier detection</kwd>
<kwd>data mining</kwd>
<kwd>medical data</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Physiology and Medicine</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Massive and complex databases often contain numerous patterns. Most traditional data mining tasks, such as frequent pattern mining, classification, and clustering, find general patterns in the datasets and regard the outliers as noise. What should not be overlooked is that outliers may embody more valuable information than general patterns, as they could imply abnormal behaviors or potential new patterns, which is consistent with real-life situations <xref ref-type="bibr" rid="B24">Liu and Schultz (2022)</xref>. An outlier generally means a point that deviates greatly from others, typically generated by a different mechanism <xref ref-type="bibr" rid="B4">Atkinson and Hawkins (1980)</xref>. Detecting outliers in a dataset is critical and beneficial for practical applications in various fields, such as fraud detection <xref ref-type="bibr" rid="B11">Fiore et al. (2019)</xref>; <xref ref-type="bibr" rid="B35">Tseng et al. (2015)</xref>, cyber-security, medical diagnostics <xref ref-type="bibr" rid="B33">Schlegl et al. (2017)</xref>; <xref ref-type="bibr" rid="B40">Zhang et al. (2016)</xref>, and others <xref ref-type="bibr" rid="B16">Kang et al. (2016)</xref>. Outliers of physiological signals in the form of time series are often studied by statistical models, with the latest examples including self-similarity matrices <xref ref-type="bibr" rid="B32">Rodrigues et al. (2022)</xref> and subsequence search <xref ref-type="bibr" rid="B12">Folgado et al. (2022)</xref>, while graph theory-based outlier detection algorithms shine in medical data composed of discrete points, the subject of this article.</p>
<p>Research on outlier detection has a long tradition. Following Hawkins&#x2019; classical definition of outliers <xref ref-type="bibr" rid="B4">Atkinson and Hawkins (1980)</xref>, researchers have developed various outlier detection algorithms and schemes over the years. Generally speaking, these approaches fall into four major groups: distribution-based <xref ref-type="bibr" rid="B43">Zong et al. (2018)</xref>, distance-based <xref ref-type="bibr" rid="B1">Amagata et al. (2021)</xref>; <xref ref-type="bibr" rid="B30">Radovanovi&#x107; et al. (2015)</xref>, density-based <xref ref-type="bibr" rid="B34">Schubert et al. (2014)</xref>; <xref ref-type="bibr" rid="B8">Corain et al. (2021)</xref>, and clustering-based <xref ref-type="bibr" rid="B26">Manzoor et al. (2016)</xref>; <xref ref-type="bibr" rid="B7">Chawla and Gionis (2013)</xref>; <xref ref-type="bibr" rid="B38">Wang et al. (2019)</xref>. The main characteristic of the distribution-based method is that it fits datasets with a standard distribution, assuming that the underlying distribution of the dataset is known in advance. It identifies the outliers as the points that do not conform to a particular distribution that sums up most of the data points. Although effective for datasets with a known distribution, the distribution-based approach is not always advisable for real-world scenarios due to the unavailability of <italic>a priori</italic> distribution knowledge and the high cost of concluding an appropriate distribution <xref ref-type="bibr" rid="B21">Li et al. (2022)</xref>. During the past 2 decades, distance-based methods have attracted much attention, finding points whose given distance range of neighbors contains less than a predetermined percentage of points of the whole dataset <xref ref-type="bibr" rid="B17">Knorr and Ng (1998)</xref>. 
In addition to the unavoidable computational expense of the distances between all pairs, the configuration of the neighboring amount <italic>k</italic> significantly influences the detection quality. The density-based algorithm was proposed to cover the shortcoming of distance-based approaches, which often fail to detect local outliers. The local outlier factor (LOF) proposed by Breunig et al. is widely used to evaluate the outsiderness degree of a point <xref ref-type="bibr" rid="B14">Jahanbegloo and Jahanbegloo (2000)</xref>, performing well in the dataset with different density distributions. LOF measures the difference between the samples&#x2019; local density and that of their <italic>k</italic>-nearest neighbors (<italic>k</italic>-NN) as the outlier factor. However, the choice of <italic>k</italic> can greatly influence performance. Clustering-based methods have gained popularity in the field of outlier detection as they can overcome the influence of parameters. Clustering divides the dataset into several clusters, making the intra-cluster distance much smaller than the inter-cluster distance. Outliers are identified as the points that are isolated from the resulting clusters. Many researchers have focused on combining clustering and outlier detection <xref ref-type="bibr" rid="B38">Wang et al. (2019)</xref>; <xref ref-type="bibr" rid="B9">Degirmenci and Karal (2022)</xref>; <xref ref-type="bibr" rid="B23">Liu et al. (2019)</xref>. Clustering based on minimum spanning trees (MSTs) is widely adopted for its ability to identify clusters with irregular boundaries <xref ref-type="bibr" rid="B36">Wang et al. (2013)</xref>. Unlike <italic>k</italic>-means, there is no assumption that the data points are grouped around centers or separated by a regular geometric curve. However, building an MST is time-consuming for large datasets and may not detect different density clusters effectively <xref ref-type="bibr" rid="B20">Li et al. (2019)</xref>.</p>
<p>This article proposes a novel outlier detection method, called Mini-MST-based Outlier Detection (MMOD), which does not require specifying the number of outliers. For the emerging real-world data without ground truth, sometimes called black-box data, algorithms that do not require a predetermined number or proportion of outliers can often be straightforwardly plug-and-play. Our approach uses a new distance measure as the edge weight of MST, to better differentiate the clusters so that the outliers in datasets with various density clusters can be identified. To improve the efficiency, we compute some mini-MSTs with a small proportion of the whole dataset and delete the points added to the trees. Our method starts with constructing a Prim&#x2019;s MST to find one data point in the densest cluster. Subsequently, some small mini-MSTs are computed from the densest point using a distance scaled by the termination threshold of Prim&#x2019;s algorithm instead of the traditional Euclidean distance to represent the edge weight. The points in each mini-MST can be regarded as a cluster. We compute a termination condition for the MST construction so that the remaining points are outliers after all the mini-MSTs are constructed. The novelty of the proposed method includes a new distance measure to construct the MST to identify different density clusters and efficiency enhancement by employing the mini-MST structure and deleting the data points while constructing the trees. Comparisons with eight state-of-the-art outlier detection methods on various real-world medical datasets and five synthetic datasets demonstrate our method&#x2019;s feasibility and effectiveness.</p>
<p>The remainder of the article is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> discusses relevant work on outlier detection. <xref ref-type="sec" rid="s3">Section 3</xref> lays out the preliminaries and definitions for subsequent tasks. <xref ref-type="sec" rid="s4">Section 4</xref> presents our mini-MST-based outlier detection method. <xref ref-type="sec" rid="s5">Section 5</xref> reports the experimental results in comparison to state-of-the-art methods. <xref ref-type="sec" rid="s6">Section 6</xref> concludes our work and looks into the future.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<sec id="s2-1">
<title>2.1 Distance-based outlier detection</title>
<p>Knorr and Ng advocated distance-based outlier detection (DOD) for the first time to soften the limitation of distribution-based methods on data distribution and prior information <xref ref-type="bibr" rid="B18">Knox and Ng (1998)</xref>. The local distance-based outlier factor (LDOF) is one of the best-known variants in distance-based approaches <xref ref-type="bibr" rid="B39">Zhang et al. (2009)</xref>, which measures the outsiderness degree in scattered real-world datasets. The relative location of one point and its neighbors evaluates the deviation of the patterns, based on which the classical top-<italic>n</italic> strategy chooses outlier candidates. As the volume of data increases and the form of data diversifies, data streams are becoming popular, spawning many studies on in-stream outlier detection. Angiulli et al. presented three algorithms to detect distance-based outliers in a sliding-window model <xref ref-type="bibr" rid="B3">Angiulli and Fassetti (2010)</xref>. A novel notion called the one-time outlier query identifies outliers in a targeted window at an arbitrary time. Radovanovi&#x107; et al., focusing on the effects of high-dimensional datasets, analyzed the relationship between antihubs and outliers taking into account the reverse nearest neighbor, that is, the point neighboring its <italic>k</italic>-NN <xref ref-type="bibr" rid="B30">Radovanovi&#x107; et al. (2015)</xref>. Continuous outlier mining employs the sliding-window data structure to reduce time and memory costs, which is flexible in terms of input parameters <xref ref-type="bibr" rid="B19">Kontaki et al. (2016)</xref>. Scalable, distributed algorithms have been put forward for substantial data. MapReduce works for distributed tasks: A multi-tactic strategy for DOD is proposed, where data characteristics are considered in data partitioning <xref ref-type="bibr" rid="B6">Cao et al. (2017)</xref>. 
The in-memory proximity graph copes with the memory problem of large datasets, analyzing the type of proximity graph for the algorithm <xref ref-type="bibr" rid="B2">Amagata et al. (2022)</xref>.</p>
</sec>
<sec id="s2-2">
<title>2.2 Minimum spanning tree-based outlier detection</title>
<p>MST is an important and widely used data structure in clustering analysis. MST-based clustering finds inconsistent edges and deletes them to form reasonable and meaningful clusters. In the case of the existence of outliers, cutting inconsistent edges can result in isolated points or clusters, which can be utilized for outlier detection.</p>
<p>Jiang et al. proposed a two-phase outlier detection method based on <italic>k</italic>-means and MST, in which small clusters are selected and deemed outliers <xref ref-type="bibr" rid="B15">Jiang et al. (2001)</xref>. There are two stages to this method. In the first phase, they used modified <italic>k</italic>-means clustering by assigning the far point as a new cluster center. In the second phase, an MST is constructed, and the longest edges are cut to find the small clusters, the tree with a few nodes. MST-based spatial outlier detection combines MST-based clustering constructed by the Delaunay triangle irregular net (D-TIN) and density-based outlier detection, performing effectively on the data of soil chemical elements <xref ref-type="bibr" rid="B22">Lin et al. (2008)</xref>. Previous work also modified the <italic>k</italic>-means algorithm to construct a spanning tree efficiently <xref ref-type="bibr" rid="B37">Wang et al. (2012)</xref>. Integrating MST-based clustering and density-based outlier detection improves the quality of detection. Meanwhile, the removal of outliers may lead to enhanced results of MST-based clustering <xref ref-type="bibr" rid="B36">Wang et al. (2013)</xref>.</p>
</sec>
<sec id="s2-3">
<title>2.3 Summary of deficiencies</title>
<p>From the existing work in outlier detection, it can be concluded that:<list list-type="simple">
<list-item>
<p>&#x2022; Distance-based models are weak in detecting local outliers. Furthermore, the boundary points in a sparse cluster may be misclassified as outliers.</p>
</list-item>
<list-item>
<p>&#x2022; Density-based models are less effective at identifying global outliers because these outliers are usually scored low.</p>
</list-item>
<list-item>
<p>&#x2022; Clustering-based models, ignoring the locations and conditions, can identify outliers that do not belong to any cluster, but are not robust to the presence of different density clusters.</p>
</list-item>
</list>
</p>
<p>We propose a novel method inspired by MST to tackle the shortcomings mentioned above.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Foundation</title>
<sec id="s3-1">
<title>3.1 Preliminaries</title>
<p>Spanning tree. Given <italic>N n</italic>-dimensional data points (vertices) in Euclidean space, the spanning tree is a tree that includes all <italic>N</italic> vertices without closed loops, in which the number of edges is not greater than <inline-formula id="inf1">
<mml:math id="m1">
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula> because full connectivity is not required.</p>
<p>Minimum spanning tree (MST). An MST is a spanning tree whose total weight is minimal among all spanning trees, which means that the number of edges in an MST is <italic>N</italic> &#x2212; 1. The total weight is the sum of the weight of all edges of the tree. Generally speaking, the weight of an edge in a tree is the Euclidean distance between its two endpoints. Mahalanobis distance or other metrics can also be used as a measure.</p>
<p>Prim&#x2019;s MST. Among the three traditional algorithms for constructing MST, Prim, Kruskal, and Boruvka, this work employs Prim <xref ref-type="bibr" rid="B27">Medak (2018)</xref>, whose process can be briefly described as follows:<list list-type="simple">
<list-item>
<p>&#x2022; Randomly choose one point in the dataset as the root of the tree;</p>
</list-item>
<list-item>
<p>&#x2022; Compute the pairwise distances between the chosen point and other points to find the shortest edge;</p>
</list-item>
<list-item>
<p>&#x2022; Add the shortest edge and the other endpoint of it to the tree;</p>
</list-item>
<list-item>
<p>&#x2022; Repeat the steps above until all the data points are added to the tree.</p>
</list-item>
</list>
</p>
<p>Euclidean distance (<italic>d</italic>). Given two endpoints <italic>x</italic>
<sub>1</sub> and <italic>x</italic>
<sub>2</sub> of the <italic>i</italic>th edge <italic>e</italic>
<sub>
<italic>i</italic>
</sub> of an MST in the <italic>n</italic>-dimensional Euclidean space, the Euclidean distance between <italic>x</italic>
<sub>1</sub> and <italic>x</italic>
<sub>2</sub> is<disp-formula id="e1">
<mml:math id="m2">
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>.</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
</sec>
<sec id="s3-2">
<title>3.2 Definitions</title>
<p>Threshold of termination (<italic>T</italic>
<sub>
<italic>t</italic>
</sub>). A global termination threshold sets the stopping condition of the cluster computation to identify the remaining points as outliers, defined as<disp-formula id="e2">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf2">
<mml:math id="m4">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is the average weight of all edges {<italic>e</italic>
<sub>
<italic>i</italic>
</sub>} in the Prim&#x2019;s MST constructed of the dataset, computed as<disp-formula id="e3">
<mml:math id="m5">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>where the numerator accumulates all edges in the Prim&#x2019;s MST.</p>
<p>Threshold-based Euclidean distance (<italic>ted</italic>). We put forward a weighted Euclidean distance to replace the traditional Euclidean distance, computed as<disp-formula id="e4">
<mml:math id="m6">
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>
<italic>T</italic>
<sub>
<italic>t</italic>
</sub> is calculated based on all edges from the MST of the entire dataset, which enables the scaled distances to handle different density clusters by reducing the discrepancy of the edge weights.</p>
<p>Mini-MST generation. In this work, the mini-MST generation algorithm starts from a point in the densest cluster and computes the MST using <italic>ted</italic>. When an edge is supposed to be added to the tree, its weight is first compared to the adaptive exit condition defined below. If the former is greater, the other end of the current edge does not belong to the current cluster. Consequently, the computation of the current mini-MST terminates and a new construction starts.</p>
<p>Mini-edge weight set (<italic>MEW</italic>). A mini-edge weight set records the weight of the edges added to the mini-MST. Once an edge is added to the MST, its weight enters <italic>MEW</italic>.</p>
<p>The first value added to <italic>MEW</italic>, denoted as <italic>MEW</italic>
<sub>1</sub>, defaults to <italic>d</italic>
<sub>1</sub>, the length of the first edge added to the mini-MST. The default value performs well on all real-world datasets used in this work, as <xref ref-type="sec" rid="s5-2">Section 5.2</xref> shows. In exceptional cases, like a significantly high value of <italic>d</italic>
<sub>1</sub>, <italic>MEW</italic>
<sub>1</sub> can be tuned, such as for the synthetic &#x201c;Two densities&#x201d; and &#x201c;Three clusters&#x201d; datasets in <xref ref-type="sec" rid="s12">Appendix</xref>, where <italic>MEW</italic>
<sub>1</sub> was set to 1.</p>
<p>Adaptive exit condition of mini-MST generation (<italic>aec</italic>). To improve efficiency, we repeatedly compute mini-MSTs and delete the points added to the MST, applying an adaptively updated exit condition that judges whether the mini-MST generation should terminate at the targeted edge <italic>e</italic>
<sub>
<italic>i</italic>
</sub>:<disp-formula id="e5">
<mml:math id="m7">
<mml:mi>a</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>where <inline-formula id="inf3">
<mml:math id="m8">
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> and <inline-formula id="inf4">
<mml:math id="m9">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> are the size and the average weight of the current <italic>MEW</italic> that <italic>e</italic>
<sub>
<italic>i</italic>
</sub> is supposed to enter, respectively.</p>
<p>MST-based outliers. MST-based outliers are the points not added to any generated mini-MST. Our method does not require a given number of outliers; instead, <italic>aec</italic> and <italic>T</italic>
<sub>
<italic>t</italic>
</sub> differentiate the different density clusters and outliers. The construction of the mini-MSTs finishes when the weight of the next edge is greater than the threshold, so that the points in this current mini-MST can be regarded as a cluster with the same density. Furthermore, a sliding window is applied to the edge weight denoted by the Euclidean distance. If the mean value of such a window is greater than <italic>T</italic>
<sub>
<italic>t</italic>
</sub>, the remaining points that are not ready to be added to the tree should be deemed outliers.</p>
</sec>
</sec>
<sec sec-type="methods" id="s4">
<title>4 Methods</title>
<sec id="s4-1">
<title>4.1 MST generation details and an illustrative example</title>
<p>In response to traditional MST-clustering-based outlier detection&#x2019;s weak performance on datasets with different densities, this work applies a novel distance measure scaled by the threshold of algorithm termination for better discrimination of normal points and outliers. Such a threshold could be considered a quasi-measure of noise in the dataset. The second algorithm improvement of this work targets efficiency: Mini-MSTs are built iteratively. Finishing a mini-MST generation in a cluster is followed by the deletion of processed points and a new construction procedure on the remaining points. An adaptive exit condition based on a progressively updated <italic>MEW</italic> qualifies the termination of the mini-MST building. A traditional MST algorithm, like Prim, is first applied to create an exact MST to find the point in the densest cluster. Subsequently, all edges are sorted in non-decreasing order to ensure that the first edge&#x2019;s two endpoints are in the densest cluster because the higher the cluster&#x2019;s density, the shorter the distances between its points. The edges between different density clusters are taken into account.</p>
<p>
<xref ref-type="fig" rid="F1">Figure 1</xref> illustrates a simplified case that embodies four clusters with different densities and six outliers. <italic>C</italic>
<sub>1</sub> is the densest cluster with the smallest average weight of edges. A Prim&#x2019;s MST is constructed first to find the point in the densest cluster, as <xref ref-type="fig" rid="F1">Figure 1A</xref> demonstrates. The shortest edge can be identified by sorting the edges in Prim&#x2019;s MST in non-decreasing order. Let <italic>s</italic> denote the start point of the shortest edge in <italic>C</italic>
<sub>1</sub>, from which a mini-MST is computed. Like Prim, the shortest edge is repeatedly added to the mini-MST until the next edge&#x2019;s weight is larger than the exit condition <italic>aec</italic> (see Eq. <xref ref-type="disp-formula" rid="e5">(5)</xref>). The points in the built mini-MST are labeled normal and removed from the dataset. The above steps are repeated from the point of the next densest cluster, which in this example is <italic>C</italic>
<sub>2</sub>, and the whole procedure ends with the adaptive exit condition being satisfied. The remaining points are considered outliers.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>An intuitive example of the adaptive mini-minimum spanning tree-based outlier detection (MMOD) method. <italic>C</italic>
<sub>1</sub>, <italic>C</italic>
<sub>2</sub>, <italic>C</italic>
<sub>3</sub>, and <italic>C</italic>
<sub>4</sub>: four clusters of different densities; <bold>(A)</bold> Prim&#x2019;s MST on the original dataset; <bold>(B)</bold>&#x2013;<bold>(E)</bold> procedure of iterative mini-MST construction; <bold>(F)</bold> detected outliers (in red).</p>
</caption>
<graphic xlink:href="fphys-14-1233341-g001.tif"/>
</fig>
</sec>
<sec id="s4-2">
<title>4.2 Adaptive mini-minimum spanning tree-based outlier detection (MMOD)</title>
<p>As can be observed from the example in <xref ref-type="sec" rid="s4-1">Section 4.1</xref>, the proposed method is centered on the iterative computation of mini-MSTs. Similarly to Prim, two arrays, <italic>labeled</italic>_<italic>data</italic> and <italic>unlabeled</italic>_<italic>data</italic>, are used to record data points added or not added to the tree, initialized by an empty set and all points, respectively. Unlike the traditional exact MST, the MST in our algorithm is constructed according to the data density, and an exit condition is added to obtain a mini-MST for efficiency. The edge weight of the mini-MST is a threshold-based Euclidean distance in place of the conventional Euclidean distance (see Eq. <xref ref-type="disp-formula" rid="e4">4</xref>). <italic>s</italic> denotes the start point of the mini-MST. Aside from the MST array used in Prim&#x2019;s MST, an additional <italic>ted</italic>_<italic>arr</italic> records the threshold-based distance between all data points. <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref> details the mini-MST construction.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>Mini-minimum spanning tree construction<list list-type="simple">
<list-item>
<p>
<bold>Require:</bold> a set of <italic>N</italic> data points, <italic>R</italic>; start point, <italic>s</italic>; <italic>labeled</italic>_<italic>data</italic>; <italic>unlabeled</italic>_<italic>data</italic>
</p>
</list-item>
<list-item>
<p>
<bold>Ensure:</bold> an MST</p>
</list-item>
<list-item>
<p>1:&#x2003;Let <bold>
<italic>MEW</italic>
</bold> denote mini edge weight set</p>
</list-item>
<list-item>
<p>2:&#x2003;Let <bold>
<italic>result_set</italic>
</bold> denote the generated MST</p>
</list-item>
<list-item>
<p>3:&#x2003;Let <bold>
<italic>ted_arr</italic>
</bold> denote <italic>N</italic> &#x2212; 1 threshold-based Euclidean distances</p>
</list-item>
<list-item>
<p>4:&#x2003;Let <bold>
<italic>edge_arr</italic>
</bold> denote the parents of the <italic>N</italic> data points</p>
</list-item>
<list-item>
<p>5:&#x2003;<bold>for</bold> i &#x2190; 1: N <bold>do</bold>
</p>
</list-item>
<list-item>
<p>6:&#x2003;&#x2003;<bold>
<italic>edge_arr</italic>
</bold> [<italic>i</italic>] &#x2190; <italic>s</italic>
</p>
</list-item>
<list-item>
<p>7:&#x2003;<inline-formula id="inf5">
<mml:math id="m10">
<mml:mi mathvariant="bold-italic">t</mml:mi>
<mml:mi mathvariant="bold-italic">e</mml:mi>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mtext>_</mml:mtext>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">r</mml:mi>
<mml:mi mathvariant="bold-italic">r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="bold-italic">t</mml:mi>
<mml:mi mathvariant="bold-italic">e</mml:mi>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mfenced open="(" close="">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> point in <italic>R</italic>) (see Equation <xref ref-type="disp-formula" rid="e4">4</xref>)</p>
</list-item>
<list-item>
<p>8:&#x2003;<bold>end</bold> <bold>for</bold>
</p>
</list-item>
<list-item>
<p>9:&#x2003;choose another point <italic>p</italic> from <italic>R</italic> which is the nearest point to <italic>s</italic>
</p>
</list-item>
<list-item>
<p>10:&#x2003;add the edge denoted by <italic>s</italic>, <italic>p</italic>, <bold>
<italic>ted_arr</italic>
</bold> [<italic>s</italic>] to <bold>
<italic>result_set</italic>
</bold>
</p>
</list-item>
<list-item>
<p>11:&#x2003;move <italic>p</italic> from <italic>unlabeled</italic>_<italic>data</italic> to <italic>labeled</italic>_<italic>data</italic>
</p>
</list-item>
<list-item>
<p>12:&#x2003;initialize <italic>MEW</italic> with distance (<italic>s</italic>, <italic>p</italic>)</p>
</list-item>
<list-item>
<p>13:&#x2003;<bold>while</bold> True <bold>do</bold>
</p>
</list-item>
<list-item>
<p>14:&#x2003;&#x2003;initialize min&#x2009;_<italic>ted</italic> with <italic>&#x221e;</italic>;</p>
</list-item>
<list-item>
<p>15:&#x2003;&#x2003;<bold>for</bold> <italic>q</italic> in <italic>unlabeled</italic>_<italic>data</italic> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>16:&#x2003;&#x2003;&#x2003;<italic>last</italic>_<italic>weight</italic> &#x2190; <italic>ted</italic> (<italic>p</italic>, <italic>q</italic>)</p>
</list-item>
<list-item>
<p>17:&#x2003;&#x2003;&#x2003;<bold>if</bold> <italic>last</italic>_<italic>weight</italic> &#x3c; <bold>
<italic>ted_arr</italic>
</bold> [<italic>q</italic>] <bold>then</bold>
</p>
</list-item>
<list-item>
<p>18:&#x2003;&#x2003;&#x2003;&#x2003;update <bold>
<italic>ted_arr</italic>
</bold> with the <italic>last</italic>_<italic>weight</italic>
</p>
</list-item>
<list-item>
<p>19:&#x2003;&#x2003;&#x2003;&#x2003;update <bold>
<italic>edge_arr</italic>
</bold> with the index of <italic>p</italic>
</p>
</list-item>
<list-item>
<p>20:&#x2003;&#x2003;&#x2003;&#x2003;min&#x2009;_<italic>ted</italic> &#x2190; <italic>last</italic>_<italic>weight</italic>
</p>
</list-item>
<list-item>
<p>21:&#x2003;&#x2003;&#x2003;<bold>end</bold> <bold>if</bold>
</p>
</list-item>
<list-item>
<p>22:&#x2003;&#x2003;<bold>end</bold> <bold>for</bold>
</p>
</list-item>
<list-item>
<p>23:&#x2003;&#x2003;compute <italic>aec</italic> (<italic>e</italic>
<sub>
<italic>i</italic>
</sub>) according to Equation <xref ref-type="disp-formula" rid="e5">5</xref></p>
</list-item>
<list-item>
<p>24:&#x2003;&#x2003;<bold>if</bold> min&#x2009;_<italic>ted</italic> &#x3e; <italic>aec</italic> (<italic>e</italic>
<sub>
<italic>i</italic>
</sub>) <bold>then</bold> break</p>
</list-item>
<list-item>
<p>25:&#x2003;&#x2003;<bold>end</bold> <bold>if</bold>
</p>
</list-item>
<list-item>
<p>26:&#x2003;&#x2003;choose point <italic>r</italic> with smallest ted in <bold>
<italic>ted_arr</italic>
</bold>
</p>
</list-item>
<list-item>
<p>27:&#x2003;&#x2003;add the edge denoted by <italic>p</italic>, <italic>r</italic>, <bold>
<italic>ted_arr</italic>
</bold> [<italic>p</italic>] to <bold>
<italic>result_set</italic>
</bold>
</p>
</list-item>
<list-item>
<p>28:&#x2003;&#x2003;move <italic>r</italic> from <italic>unlabeled</italic>_<italic>data</italic> to <italic>labeled</italic>_<italic>data</italic>
</p>
</list-item>
<list-item>
<p>29:&#x2003;&#x2003;update <italic>p</italic> with <italic>r</italic>
</p>
</list-item>
<list-item>
<p>30:&#x2003;&#x2003;add <bold>
<italic>ted_arr</italic>
</bold> [<italic>p</italic>] to <italic>MEW</italic>
</p>
</list-item>
<list-item>
<p>31:&#x2003;<bold>end</bold> <bold>while</bold>
</p>
</list-item>
<list-item>
<p>32:&#x2003;<bold>return</bold> <bold>
<italic>result_set</italic>
</bold>, <bold>
<italic>edge_arr</italic>
</bold> and <bold>
<italic>ted_arr</italic>
</bold>
</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>Least number. To be noted, the number of points in the cluster falls within a certain range. A cluster containing too few points is considered an outlier cluster. <italic>least</italic>_<italic>number</italic> distinguishes normal clusters from outlier clusters:<disp-formula id="e6">
<mml:math id="m11">
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>_</mml:mtext>
<mml:mi>n</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>D</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>where the <italic>ROUND</italic> function finds the closest integer to the parameter; <italic>N</italic> and <italic>n</italic> are the size and dimension of the dataset, respectively. If the number of edges of a mini-MST is less than <italic>least</italic>_<italic>number</italic>, the points belonging to the tree are marked as outliers.</p>
<p>Since the algorithm starts building MSTs from the densest cluster, it keeps outliers until all mini-MSTs have been generated. Therefore, the threshold of termination <italic>T</italic>
<sub>
<italic>t</italic>
</sub> (see Eq. <xref ref-type="disp-formula" rid="e2">2</xref>) can be applied to stop finding normal points. To compare with <italic>T</italic>
<sub>
<italic>t</italic>
</sub>, a window filled with the weights of the current edge and the following five edges is used: if the mean value of this window is greater than <italic>T</italic>
<sub>
<italic>t</italic>
</sub> (see Eq. <xref ref-type="disp-formula" rid="e2">2</xref>), the remaining data points will be treated as outliers.</p>
<p>
<xref ref-type="statement" rid="Algorithm_2">Algorithm 2</xref> provides the pseudocode of the proposed adaptive mini-MST-based outlier detection, which takes the input of the dataset <italic>R</italic> with <italic>N</italic> data points and its corresponding Prim&#x2019;s MST denoted by the edges. Each edge of the MST consists of a starting point, an endpoint, and an edge weight.</p>
<p>
<statement content-type="algorithm" id="Algorithm_2">
<label>Algorithm 2</label>
<p>Adaptive mini-minimum spanning tree-based outlier detection<list list-type="simple">
<list-item>
<p>
<bold>Require:</bold> dataset <italic>R</italic>
</p>
</list-item>
<list-item>
<p>
<bold>Ensure:</bold> a label array, <bold>
<italic>labels</italic>
</bold>
</p>
</list-item>
<list-item>
<p>1:&#x2003;<bold>
<italic>labels</italic>
</bold> &#x2190; [&#x2212;1]&#x2a;<italic>N</italic>
</p>
</list-item>
<list-item>
<p>2:&#x2003;compute a Prim&#x2019;s MST using Prim&#x2019;s algorithm</p>
</list-item>
<list-item>
<p>3:&#x2003;sort the Prim&#x2019;s MST in non-decreasing order</p>
</list-item>
<list-item>
<p>4:&#x2003;compute <italic>T</italic>
<sub>
<italic>t</italic>
</sub> according to Equation <xref ref-type="disp-formula" rid="e2">2</xref></p>
</list-item>
<list-item>
<p>5:&#x2003;compute the <italic>least</italic>_<italic>number</italic> according to Equation <xref ref-type="disp-formula" rid="e6">6</xref></p>
</list-item>
<list-item>
<p>6:&#x2003;<bold>for</bold> edge in MST <bold>do</bold>
</p>
</list-item>
<list-item>
<p>7:&#x2003;&#x2003;<italic>s</italic> &#x2190;start point of edge</p>
</list-item>
<list-item>
<p>8:&#x2003;&#x2003;<bold>if</bold> one of the two ends of the edge is in <italic>labeled</italic>_<italic>data</italic> <bold>then</bold>
</p>
</list-item>
<list-item>
<p>9:&#x2003;&#x2003;&#x2003;continue</p>
</list-item>
<list-item>
<p>10:&#x2003;&#x2003;<bold>end</bold> <bold>if</bold>
</p>
</list-item>
<list-item>
<p>11:&#x2003;&#x2003;<italic>window</italic> &#x2190; the weight of the current edge and the next 5 edges;</p>
</list-item>
<list-item>
<p>12:&#x2003;&#x2003;<italic>edge</italic>_<italic>threshold</italic> &#x2190; the mean value of <italic>window</italic>
</p>
</list-item>
<list-item>
<p>13:&#x2003;&#x2003;<bold>if</bold> <italic>edge</italic>_<italic>threshold</italic> &#x3c; <italic>T</italic>
<sub>
<italic>t</italic>
</sub> <bold>then</bold> <italic>mini</italic>_<italic>mst</italic> &#x2190; <italic>Mini</italic>_<italic>MST</italic> (<italic>R</italic>, <italic>s</italic>, <italic>labeled</italic>_<italic>data</italic>, <italic>unlabeled</italic>_<italic>data</italic>)</p>
</list-item>
<list-item>
<p>14:&#x2003;&#x2003;&#x2003;<bold>if</bold> len (<italic>mini</italic>_<italic>mst</italic>) <inline-formula id="inf6">
<mml:math id="m16">
<mml:mo>&#x3e;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>_</mml:mtext>
<mml:mi>n</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
</mml:math>
</inline-formula> <bold>then</bold>
</p>
</list-item>
<list-item>
<p>15:&#x2003;&#x2003;&#x2003;&#x2003;label the two ends of the edges in <italic>mini</italic>_<italic>mst</italic> as normal</p>
</list-item>
<list-item>
<p>16:&#x2003;&#x2003;&#x2003;<bold>end</bold> <bold>if</bold>
</p>
</list-item>
<list-item>
<p>17:&#x2003;&#x2003;<bold>else</bold>
</p>
</list-item>
<list-item>
<p>18:&#x2003;&#x2003;&#x2003;break</p>
</list-item>
<list-item>
<p>19:&#x2003;&#x2003;<bold>end</bold> <bold>if</bold>
</p>
</list-item>
<list-item>
<p>20:&#x2003;<bold>end</bold> <bold>for</bold>
</p>
</list-item>
<list-item>
<p>21:&#x2003;<bold>return</bold> <bold>
<italic>labels</italic>
</bold>
</p>
</list-item>
</list>
</p>
</statement>
</p>
</sec>
</sec>
<sec id="s5">
<title>5 Experimental results and evaluation</title>
<sec id="s5-1">
<title>5.1 Applied datasets</title>
<p>Ten experiments were conducted on different real-world datasets, as summarized in <xref ref-type="table" rid="T1">Table 1</xref>, to demonstrate MMOD&#x2019;s applicability on the benchmark <xref ref-type="bibr" rid="B5">Campos et al. (2016)</xref>. The datasets will be introduced in detail in <xref ref-type="sec" rid="s5-4">Section 5.4</xref>, along with the results of the experiments carried out on them.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Description of the applied real-world datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="right">Number of Samples</th>
<th align="right">Number of Outliers</th>
<th align="right">Number of Attributes</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HeartDisease</td>
<td align="right">270</td>
<td align="right">120</td>
<td align="right">13</td>
</tr>
<tr>
<td align="left">Parkinson</td>
<td align="right">195</td>
<td align="right">147</td>
<td align="right">22</td>
</tr>
<tr>
<td align="left">Pima</td>
<td align="right">768</td>
<td align="right">268</td>
<td align="right">8</td>
</tr>
<tr>
<td align="left">SpamBase</td>
<td align="right">4601</td>
<td align="right">1813</td>
<td align="right">57</td>
</tr>
<tr>
<td align="left">WDBC_v05</td>
<td align="right">367</td>
<td align="right">10</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">WDBC_v06</td>
<td align="right">367</td>
<td align="right">10</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">WDBC_v07</td>
<td align="right">367</td>
<td align="right">10</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">WDBC_v08</td>
<td align="right">367</td>
<td align="right">10</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">WDBC_v09</td>
<td align="right">367</td>
<td align="right">10</td>
<td align="right">30</td>
</tr>
<tr>
<td align="left">WDBC_v10</td>
<td align="right">367</td>
<td align="right">10</td>
<td align="right">30</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As a supplement, experiments on five synthetic two-dimensional datasets with different morphologies are added to demonstrate MMOD&#x2019;s parameter tuning and its applicability to manually generated data; in addition, the two-dimensional visualization is intuitive and easy to read (see <xref ref-type="sec" rid="s12">Appendix</xref>).</p>
</sec>
<sec id="s5-2">
<title>5.2 State-of-the-art methods for comparison</title>
<p>MMOD&#x2019;s experimental results were compared with eight algorithms from the Python outlier detection package <xref ref-type="bibr" rid="B42">Zhao et al. (2019)</xref>, including four classical algorithms, <italic>k</italic>-NN <xref ref-type="bibr" rid="B31">Ramaswamy et al. (2000)</xref>, LOF <xref ref-type="bibr" rid="B14">Jahanbegloo and Jahanbegloo (2000)</xref>, angle-based outlier detection (ABOD) <xref ref-type="bibr" rid="B29">Pham and Pagh (2012)</xref>, and histogram-based outlier score (HBOS) <xref ref-type="bibr" rid="B13">Goldstein and Dengel (2012)</xref>, as well as four recent algorithms, one-class support vector machine (OCSVM) <xref ref-type="bibr" rid="B10">Erfani et al. (2016)</xref>, lightweight online detector of anomalies (LODA) <xref ref-type="bibr" rid="B28">Pevn&#xfd; (2016)</xref>, locally selective combination of parallel outlier ensembles (LSCP) <xref ref-type="bibr" rid="B41">Zhao et al. (2018)</xref>, and multiple-objective generative adversarial active learning (MOGAAL) <xref ref-type="bibr" rid="B25">Liu et al. (2019)</xref>. LOF and <italic>k</italic>-NN are classical density-based and distance-based methods, respectively. ABOD is developed for high-dimensional feature space datasets to alleviate the &#x201c;curse of dimensionality,&#x201d; an efficient version of which was used in our experiments. HBOS is an unsupervised outlier detection method that computes the outlierness degree by building histograms. OCSVM is an extension of the support vector algorithm that learns a kernel-based decision boundary distinguishing outliers from inliers. LODA is operative for data streams and real-time applications. LSCP, also unsupervised, chooses the competent detectors by using the local region of the data points. The newly presented MOGAAL is based on a generative adversarial active learning neural network.</p>
<p>To generate a fair comparison reference, the <italic>k</italic>
<sub>
<italic>threshold</italic>
</sub> value for each state-of-the-art method being compared was set to 7, a typical value setting. Literature such as <xref ref-type="bibr" rid="B5">Campos et al. (2016)</xref> records the performance of other <italic>k</italic>
<sub>
<italic>threshold</italic>
</sub> values on most reference methods. The outlier percentage is calculated as the number of outliers divided by the size of the dataset. All experiments were run through Python 3.6.5 on a computer with an Intel<sup>&#xae;</sup> Core&#x2122; 3.2 GHz i5-3470 CPU and 4 GB RAM.</p>
</sec>
<sec id="s5-3">
<title>5.3 Evaluation metrics</title>
<p>Conventional evaluation metrics precision, recall, and <italic>F</italic>-measure were applied to analyze and compare the experimental results on real-world datasets. Let <italic>m</italic> denote the number of correct outliers returned by the detector, <italic>n</italic> denote the total number of all outliers returned by the detector, and <italic>o</italic> denote the number of ground-truth outliers. The precision <italic>P</italic> is the proportion of correct outliers in all outliers identified by the detector:<disp-formula id="e7">
<mml:math id="m13">
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>The recall <italic>R</italic> is the proportion of correct outliers that the detector returns in all ground-truth outliers:<disp-formula id="e8">
<mml:math id="m14">
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>The <italic>F</italic>-measure is the harmonic mean of precision and recall:<disp-formula id="e9">
<mml:math id="m15">
<mml:mi>F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
</sec>
<sec id="s5-4">
<title>5.4 Results on real-world datasets</title>
<p>Applying real-world datasets can demonstrate the effectiveness of the proposed method straightforwardly. Since medical data are one of the most prominent application scenarios of outlier detection, nine widely investigated open-source medical datasets are utilized for experiments. A spam dataset is additionally brought into the experiment as a case for other domain applications. On each real-world dataset, default MMOD parameter settings or formulas defined in <xref ref-type="sec" rid="s3-2">Section 3.2</xref> were adopted, such as the <italic>MEW</italic>&#x2019;s first added value <italic>MEW</italic>
<sub>1</sub>, the threshold-based Euclidean distance <italic>ted</italic>, and the exit condition <italic>aec</italic>, which evidences the broad applicability of MMOD without parameter tuning. The precision, recall, and <italic>F</italic>-measure values of MMOD and of the peer methods are fully recorded in <xref ref-type="table" rid="T2">Tables 2</xref>&#x2013;<xref ref-type="table" rid="T4">4</xref>, of which the statistics are plotted in <xref ref-type="fig" rid="F2">Figures 2</xref>&#x2013;<xref ref-type="fig" rid="F4">4</xref> for visual comparison.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>The precisions of experimental results from MMOD and eight state-of-the-art algorithms on the real-world datasets. The best performance on each dataset is indicated in bold.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="center">MMOD</th>
<th align="right">ABOD</th>
<th align="right">HBOS</th>
<th align="right">KNN</th>
<th align="right">LODA</th>
<th align="right">LOF</th>
<th align="right">LSCP</th>
<th align="right">MOGAAL</th>
<th align="right">OCSVM</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HeartDisease</td>
<td align="center">0.44</td>
<td align="right">0.52</td>
<td align="right">
<bold>0.70</bold>
</td>
<td align="right">0.50</td>
<td align="right">0.28</td>
<td align="right">0.45</td>
<td align="right">0.44</td>
<td align="right">0.35</td>
<td align="right">0.59</td>
</tr>
<tr>
<td align="left">Parkinson</td>
<td align="center">0.75</td>
<td align="right">0.85</td>
<td align="right">
<bold>1.00</bold>
</td>
<td align="right">0.90</td>
<td align="right">0.90</td>
<td align="right">0.85</td>
<td align="right">0.90</td>
<td align="right">0.65</td>
<td align="right">0.70</td>
</tr>
<tr>
<td align="left">Pima</td>
<td align="center">0.35</td>
<td align="right">0.53</td>
<td align="right">
<bold>0.66</bold>
</td>
<td align="right">0.49</td>
<td align="right">0.44</td>
<td align="right">0.45</td>
<td align="right">0.45</td>
<td align="right">0.47</td>
<td align="right">0.52</td>
</tr>
<tr>
<td align="left">SpamBase</td>
<td align="center">0.41</td>
<td align="right">0.00</td>
<td align="right">
<bold>0.53</bold>
</td>
<td align="right">0.36</td>
<td align="right">0.14</td>
<td align="right">0.49</td>
<td align="right">0.35</td>
<td align="right">0.15</td>
<td align="right">0.25</td>
</tr>
<tr>
<td align="left">WDBC_v05</td>
<td align="center">
<bold>0.80</bold>
</td>
<td align="right">0.50</td>
<td align="right">0.20</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.70</td>
<td align="right">0.30</td>
<td align="right">0.24</td>
<td align="right">0.00</td>
<td align="right">0.03</td>
</tr>
<tr>
<td align="left">WDBC_v06</td>
<td align="center">
<bold>0.70</bold>
</td>
<td align="right">
<bold>0.70</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.60</td>
<td align="right">0.50</td>
<td align="right">0.60</td>
<td align="right">0.24</td>
<td align="right">0.00</td>
<td align="right">0.00</td>
</tr>
<tr>
<td align="left">WDBC_v07</td>
<td align="center">0.48</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.20</td>
<td align="right">0.70</td>
<td align="right">0.70</td>
<td align="right">0.70</td>
<td align="right">0.24</td>
<td align="right">0.00</td>
<td align="right">0.03</td>
</tr>
<tr>
<td align="left">WDBC_v08</td>
<td align="center">0.48</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.20</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.40</td>
<td align="right">0.27</td>
<td align="right">0.00</td>
<td align="right">0.03</td>
</tr>
<tr>
<td align="left">WDBC_v09</td>
<td align="center">
<bold>0.86</bold>
</td>
<td align="right">0.50</td>
<td align="right">0.10</td>
<td align="right">0.60</td>
<td align="right">0.50</td>
<td align="right">0.50</td>
<td align="right">0.24</td>
<td align="right">0.00</td>
<td align="right">0.03</td>
</tr>
<tr>
<td align="left">WDBC_v10</td>
<td align="center">0.77</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">0.10</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">0.20</td>
<td align="right">0.27</td>
<td align="right">0.00</td>
<td align="right">0.03</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>The recalls of experimental results from MMOD and eight state-of-the-art algorithms on the real-world datasets. The best performance on each dataset is indicated in bold.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="center">MMOD</th>
<th align="right">ABOD</th>
<th align="right">HBOS</th>
<th align="right">KNN</th>
<th align="right">LODA</th>
<th align="right">LOF</th>
<th align="right">LSCP</th>
<th align="right">MOGAAL</th>
<th align="right">OCSVM</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HeartDisease</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.52</td>
<td align="right">0.70</td>
<td align="right">0.50</td>
<td align="right">0.28</td>
<td align="right">0.45</td>
<td align="right">0.10</td>
<td align="right">0.35</td>
<td align="right">0.13</td>
</tr>
<tr>
<td align="left">Parkinson</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.12</td>
<td align="right">0.14</td>
<td align="right">0.12</td>
<td align="right">0.12</td>
<td align="right">0.12</td>
<td align="right">0.12</td>
<td align="right">0.09</td>
<td align="right">0.10</td>
</tr>
<tr>
<td align="left">Pima</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.15</td>
<td align="right">0.19</td>
<td align="right">0.14</td>
<td align="right">0.13</td>
<td align="right">0.13</td>
<td align="right">0.13</td>
<td align="right">0.13</td>
<td align="right">0.15</td>
</tr>
<tr>
<td align="left">SpamBase</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.13</td>
<td align="right">0.09</td>
<td align="right">0.04</td>
<td align="right">0.12</td>
<td align="right">0.09</td>
<td align="right">0.04</td>
<td align="right">0.06</td>
</tr>
<tr>
<td align="left">WDBC_v05</td>
<td align="center">0.80</td>
<td align="right">0.50</td>
<td align="right">0.20</td>
<td align="right">0.80</td>
<td align="right">0.70</td>
<td align="right">0.30</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.10</td>
</tr>
<tr>
<td align="left">WDBC_v06</td>
<td align="center">0.70</td>
<td align="right">0.70</td>
<td align="right">0.00</td>
<td align="right">0.60</td>
<td align="right">0.50</td>
<td align="right">0.60</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.00</td>
</tr>
<tr>
<td align="left">WDBC_v07</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.80</td>
<td align="right">0.20</td>
<td align="right">0.70</td>
<td align="right">0.70</td>
<td align="right">0.70</td>
<td align="right">0.90</td>
<td align="right">0.00</td>
<td align="right">0.10</td>
</tr>
<tr>
<td align="left">WDBC_v08</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.80</td>
<td align="right">0.20</td>
<td align="right">0.80</td>
<td align="right">0.80</td>
<td align="right">0.40</td>
<td align="right">
<bold>1.00</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.10</td>
</tr>
<tr>
<td align="left">WDBC_v09</td>
<td align="center">0.60</td>
<td align="right">0.50</td>
<td align="right">0.10</td>
<td align="right">0.60</td>
<td align="right">0.50</td>
<td align="right">0.50</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.10</td>
</tr>
<tr>
<td align="left">WDBC_v10</td>
<td align="center">
<bold>1.00</bold>
</td>
<td align="right">0.90</td>
<td align="right">0.10</td>
<td align="right">0.90</td>
<td align="right">0.90</td>
<td align="right">0.20</td>
<td align="right">
<bold>1.00</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.10</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>The <italic>F</italic>-measures of experimental results from MMOD and eight state-of-the-art algorithms on the real-world datasets. The best performance on each dataset is indicated in bold.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="center">MMOD</th>
<th align="right">ABOD</th>
<th align="right">HBOS</th>
<th align="right">KNN</th>
<th align="right">LODA</th>
<th align="right">LOF</th>
<th align="right">LSCP</th>
<th align="right">MOGAAL</th>
<th align="right">OCSVM</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HeartDisease</td>
<td align="center">0.62</td>
<td align="right">0.52</td>
<td align="right">
<bold>0.70</bold>
</td>
<td align="right">0.50</td>
<td align="right">0.28</td>
<td align="right">0.45</td>
<td align="right">0.16</td>
<td align="right">0.35</td>
<td align="right">0.22</td>
</tr>
<tr>
<td align="left">Parkinson</td>
<td align="center">
<bold>0.86</bold>
</td>
<td align="right">0.20</td>
<td align="right">0.24</td>
<td align="right">0.22</td>
<td align="right">0.22</td>
<td align="right">0.20</td>
<td align="right">0.22</td>
<td align="right">0.16</td>
<td align="right">0.17</td>
</tr>
<tr>
<td align="left">Pima</td>
<td align="center">
<bold>0.52</bold>
</td>
<td align="right">0.24</td>
<td align="right">0.30</td>
<td align="right">0.22</td>
<td align="right">0.20</td>
<td align="right">0.20</td>
<td align="right">0.20</td>
<td align="right">0.21</td>
<td align="right">0.23</td>
</tr>
<tr>
<td align="left">SpamBase</td>
<td align="center">
<bold>0.58</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.21</td>
<td align="right">0.15</td>
<td align="right">0.06</td>
<td align="right">0.19</td>
<td align="right">0.14</td>
<td align="right">0.06</td>
<td align="right">0.10</td>
</tr>
<tr>
<td align="left">WDBC_v05</td>
<td align="center">
<bold>0.80</bold>
</td>
<td align="right">0.50</td>
<td align="right">0.20</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.70</td>
<td align="right">0.30</td>
<td align="right">0.38</td>
<td align="right">0.00</td>
<td align="right">0.04</td>
</tr>
<tr>
<td align="left">WDBC_v06</td>
<td align="center">
<bold>0.70</bold>
</td>
<td align="right">
<bold>0.70</bold>
</td>
<td align="right">0.00</td>
<td align="right">0.60</td>
<td align="right">0.50</td>
<td align="right">0.60</td>
<td align="right">0.38</td>
<td align="right">0.00</td>
<td align="right">0.00</td>
</tr>
<tr>
<td align="left">WDBC_v07</td>
<td align="center">0.65</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.20</td>
<td align="right">0.70</td>
<td align="right">0.70</td>
<td align="right">0.70</td>
<td align="right">0.38</td>
<td align="right">0.00</td>
<td align="right">0.04</td>
</tr>
<tr>
<td align="left">WDBC_v08</td>
<td align="center">0.65</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.20</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">
<bold>0.80</bold>
</td>
<td align="right">0.40</td>
<td align="right">0.43</td>
<td align="right">0.00</td>
<td align="right">0.04</td>
</tr>
<tr>
<td align="left">WDBC_v09</td>
<td align="center">
<bold>0.71</bold>
</td>
<td align="right">0.50</td>
<td align="right">0.10</td>
<td align="right">0.60</td>
<td align="right">0.50</td>
<td align="right">0.50</td>
<td align="right">0.38</td>
<td align="right">0.00</td>
<td align="right">0.04</td>
</tr>
<tr>
<td align="left">WDBC_v10</td>
<td align="center">0.87</td>
<td align="right">
<bold>0.90</bold>
</td>
<td align="right">0.10</td>
<td align="right">0.90</td>
<td align="right">0.90</td>
<td align="right">0.20</td>
<td align="right">0.43</td>
<td align="right">0.00</td>
<td align="right">0.04</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The precision of MMOD&#x2019;s and peer methods&#x2019; experimental results on the real-world datasets.</p>
</caption>
<graphic xlink:href="fphys-14-1233341-g002.tif"/>
</fig>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The recall of MMOD&#x2019;s and peer methods&#x2019; experimental results on the real-world datasets.</p>
</caption>
<graphic xlink:href="fphys-14-1233341-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>The <italic>F</italic>-measure of MMOD&#x2019;s and peer methods&#x2019; experimental results on the real-world datasets.</p>
</caption>
<graphic xlink:href="fphys-14-1233341-g004.tif"/>
</fig>
<sec id="s5-4-1">
<title>5.4.1 The HeartDisease dataset</title>
<p>HeartDisease contains 270 instances, of which 120 outliers represent patients, and the rest describe healthy individuals, showing a close number of normal samples and outliers. Normalized, unduplicated data were used for the experiments of the nine algorithms. Overall, none of the methods performed ideally on this dataset. The highest precision and recall were generated by HBOS and MMOD, respectively. Regarding the <italic>F</italic>-measure, MMOD came in second place, slightly below HBOS, while the rest of the methods did not exceed 0.6. MMOD&#x2019;s perfect recall is worth noting. In terms of dataset composition, HeartDisease is the only one from all participating datasets with normal samples and outliers close to half-and-half. With such a high percentage of outliers (only lower than Parkinson), there are only 13 attributes used for detection (the second fewest), which evidences the difficulty of detection. Nevertheless, MMOD detected all outliers without missing any, despite causing many false identifications. In contrast, although HBOS has a higher <italic>F</italic>-measure than MMOD with a 0.08 gap, it has a recall loss of 0.30, which is too high a leakage rate, being insensitive for disease detection.</p>
</sec>
<sec id="s5-4-2">
<title>5.4.2 The Parkinson dataset</title>
<p>To evaluate MMOD&#x2019;s effectiveness on a large percentage of outliers, we use the normalized, unduplicated Parkinson dataset, consisting of 195 instances, among which 147 are Parkinson&#x2019;s disease patients as outliers. Due to the outlier percentage being larger than 50%, no parameters were passed to the eight peer algorithms. All methods performed acceptably in terms of precision, but MMOD is the only one standing out in terms of recall, contributing to its far-leading <italic>F</italic>-measure. For comparison, the <italic>F</italic>-measures of all peer algorithms are below 0.5. Compared to HeartDisease, Parkinson has a substantially higher percentage of outliers, over three-quarters, the highest among all datasets applied. Meanwhile, its total number of attributes is sizably greater than HeartDisease, at 22. Regarding Parkinson&#x2019;s detection, MMOD&#x2019;s perfect recall means that no patient is missed.</p>
</sec>
<sec id="s5-4-3">
<title>5.4.3 The Pima dataset for diabetes</title>
<p>Another normalized, unduplicated medical dataset, Pima, contains 768 cases, including 268 diabetic patients as outliers. Each sample is composed of 8 attributes. MMOD works imperfectly in terms of precision, although none of the methods performs well in this respect; however, MMOD&#x2019;s recall and <italic>F</italic>-measure are highlights. <xref ref-type="table" rid="T1">Table 1</xref> implies that Pima contains exactly 500 normal samples, which makes the proportion of outliers about 34.90%, roughly one-third of the total data, for which the number of attributes used to describe the samples is the lowest of all the datasets. MMOD succeeded in detecting all diabetic cases but resulted in a certain number of false positives. Similar to Parkinson, on the <italic>F</italic>-measure, which indicates the overall performance, MMOD outperformed the other methods by a large margin, as none of the others exceeded 0.30.</p>
</sec>
<sec id="s5-4-4">
<title>5.4.4 The WDBC corpus and its variation sets for breast cancer</title>
<p>WDBC describes the nuclear characteristics of a breast cancer diagnosis, whose different variation datasets used in our experiments are randomly downsampled from the original classification dataset for outlier detection <xref ref-type="bibr" rid="B39">Zhang et al. (2009)</xref>. Each variation of WDBC contains 367 samples, among which there are 10 outliers representing malignant cancers, while other instances indicate benign cancers. Therefore, the outlier proportion of the six WDBC datasets is uniform and tiny, about 2.72%, much smaller than others. Nevertheless, a relatively higher number of attributes are used to characterize the samples, reaching 30, the second highest. The nine algorithms, including MMOD, were experimented on the WDBC dataset&#x2019;s six unnormalized, unduplicated subsets. For all applied WDBC datasets, MOGAAL was unable to identify any outliers, quitting the competition early.</p>
<p>For WDBC_v05, the proposed MMOD achieves the highest precision of 0.8, along with KNN, followed by LODA with 0.7. None of the other methods achieves a precision greater than 0.5 in this dataset. Regarding recall, LSCP achieves 0.9, while MMOD and KNN are 0.8. Nonetheless, LSCP&#x2019;s <italic>F</italic>-measure is underperforming due to its low precision, while MMOD and KNN win at <italic>F</italic>-measure. MMOD on WDBC_v06 and WDBC_v09 also yielded similar situations of &#x201c;optimal precision, suboptimal recall, and best <italic>F</italic>-measure,&#x201d; just that ABOD replaced KNN as the joint winner on WDBC_v06, while MMOD alone performed best on WDBC_v09. It is noteworthy that besides MOGAAL, HBOS and OCSVM also failed on WDBC_v06. MMOD&#x2019;s performance metrics on WDBC_v07, WDBC_v08, and WDBC_v10 are similar: perfect recalls with non-optimal precisions and <italic>F</italic>-measures.</p>
<p>A perfect recall of 1 means that all true malignancies are found without missing, while suboptimal precision represents the presence of a false positive chance. Overall, MMOD has a relatively high recall on WDBC, slightly inferior to LSCP (MMOD is higher only on WDBC_v07, while on par or lower on the rest). Still, given LSCP&#x2019;s inferior precision, it can be claimed that MMOD works well overall on WDBC_v05&#x2013;WDBC_v10, as evidenced also by the <italic>F</italic>-measures. It can also be observed from <xref ref-type="fig" rid="F4">Figure 4</xref> that MMOD&#x2019;s <italic>F</italic>-measure performance is relatively stable among a group of algorithms.</p>
</sec>
<sec id="s5-4-5">
<title>5.4.5 The SpamBase dataset</title>
<p>Additionally, an email dataset beyond medical scenarios, SpamBase, was applied, which consists of 4,601 objects of 57 attributes, 1,813 of which are spam emails as outliers. It is considerably challenging to detect outliers in such a dataset. Like in WDBC, MOGAAL did not manage to work. MMOD ranks third in precision, while its recall is again far ahead, leading to the winning <italic>F</italic>-measure. In addition to having the most significant number of samples and attributes, SpamBase has a large outlier quantity, accounting for 39.40%. This relatively &#x201c;big&#x201d; data witnessed MMOD&#x2019;s report card of not missing any spam. All other algorithms have weak <italic>F</italic>-measures no higher than 0.21.</p>
</sec>
</sec>
<sec id="s5-5">
<title>5.5 Comprehensive performance analysis and discussion</title>
<p>MMOD has perfect or nearly perfect recalls on most datasets, which should be attributed to its ability to greatly retain possible outliers, enabled by the adaptive exit condition. Such an adaptive termination mechanism also improves the efficiency of the algorithm. LSCP&#x2019;s recall performance is comparable to MMOD on WDBC, but on the one hand, its recall is far worse than MMOD on the other datasets; on the other hand, its precision on WDBC is also significantly inferior to MMOD. Regarding precision, HBOS works well on four datasets, but is overall unstable and particularly poor on the other six. MMOD is optimal in three datasets and at an average level globally.</p>
<p>Two of the advanced aspects of MMOD are that it does not require the number of outliers as input and that it is outlier quantity and proportion insensitive. Such a characteristic was well reflected in the experimental results. The applied datasets include various outlier percentages, such as a small portion of outliers, a large percentage of outliers, and a close proportion of outliers and normal samples. Evidently, most of the peer methods are affected by such setups. For datasets with a high percentage of outliers, such as HeartDisease, Parkinson, Pima, and SpamBase, HBOS has high precision values; however, for cases with a low percentage of outliers, HBOS&#x2019;s precision almost hits rock bottom. Worse, its recalls are always poor, no matter the outlier percentage. <italic>k</italic>-NN, LODA, and LSCP are almost the opposite. <italic>k</italic>-NN and LODA&#x2019;s precision and recall on datasets with a low percentage of outliers are significantly better than the case with a high percentage of outliers. LSCP&#x2019;s recall is excellent when the percentage of outliers is low; for the high percentage of outliers, LSCP is almost incapable, not to mention its unsatisfying precision all the time. As a comparison, MMOD&#x2019;s performance is more consistent regardless of the outlier percentage, without dramatically poor metric values. Its recall is especially consistently splendid, its precision is in the middle of the pack, and its <italic>F</italic>-measure is relatively robust, all verifying that MMOD, which does not take outlier numbers or percentages as inputs, works insensitively to outlier quantity and proportion.</p>
<p>Which one of recall and precision is more valued during outlier detection is relevant to the application scenario. For medical data, especially disease diagnosis, recall is related to whether cases with real diseases will be missed. The preliminary validation experiments of MMOD&#x2019;s method suggest its usability on medical data.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>Outlier detection is an important approach to data mining, which is widely studied in medical scenarios. MST has been widely applied to clustering and outlier detection as an essential data structure in graph theory. In order to overcome the problems in distance-based and density-based outlier detection, an adaptive mini-minimum spanning tree-based outlier detection (MMOD) method was proposed in this article, employing threshold-based Euclidean distances as the edge weight and adaptive exit conditions of mini-MST generation, to improve efficiency. MMOD does not require the outlier percentage as an input parameter, which peer outlier detection algorithms usually need. Moreover, MMOD can detect outliers in datasets with different densities and is insensitive to the outlier proportion and distribution. A series of experiments in real-world medical datasets manifested the promising results of MMOD; additional spam and five synthetic datasets further validated its applicability. Topics on MST-based outlier detection methods, such as the quantitative measurement of outsiderness degree, remain valuable directions for future research.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://github.com/laetella/MMOD">https://github.com/laetella/MMOD</ext-link>.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>Conceptualization, methodology, implementation, and experiment, JL; validation, JwL, TS, and HL; investigation, JwL; visualization, JL and HL; analysis and discussion, JL, JwL, and HL; writing&#x2014;original draft preparation, JL and HL; writing&#x2014;review and rewriting, JL and HL; supervision, CW, FV, and HL; funding acquisition, HL and CW. All authors contributed to the article and approved the submitted version.
</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>The APC was funded by the Open Access Initiative of the University of Bremen and the DFG <italic>via</italic> SuUB Bremen. The research is partially supported by the National Natural Science Foundation of China (No. 62272379) and the Natural Science Basic Research Plan in Shaanxi Province (2021JM-018).</p>
</sec>
<ack>
<p>We extend our sincere gratitude to Xiaochun Wang for her help in conceptualization and methodology. This work was started with the support of Fund 2020JM-046, for which she is responsible.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s12">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fphys.2023.1233341/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fphys.2023.1233341/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Amagata</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Onizuka</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hara</surname>
<given-names>T.</given-names>
</name>
</person-group> &#x201c;<article-title>Fast and exact outlier detection in metric spaces: A proximity graph-based approach</article-title>,&#x201d; in <conf-name>Proceedings of the 2021 International Conference on Management of Data</conf-name>, <conf-loc>Virtual Event China</conf-loc>, <conf-date>June 2021</conf-date>, <fpage>36</fpage>&#x2013;<lpage>48</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Amagata</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Onizuka</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hara</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Fast, exact, and parallel-friendly outlier detection algorithms with proximity graph in metric spaces</article-title>. <source>VLDB J</source>, <volume>31</volume>. <pub-id pub-id-type="doi">10.1007/s00778-022-00729-1</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Angiulli</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fassetti</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Distance-based outlier queries in data streams: the novel task and algorithms</article-title>. <source>Data Min. Knowl. Discov.</source> <volume>20</volume>, <fpage>290</fpage>&#x2013;<lpage>324</lpage>. <pub-id pub-id-type="doi">10.1007/s10618-009-0159-9</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Atkinson</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Hawkins</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>1980</year>). <article-title>Identification of outliers</article-title>. <source>Biometrics</source> <volume>37</volume>, <fpage>860</fpage>. <pub-id pub-id-type="doi">10.2307/2530182</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Campos</surname>
<given-names>G. O.</given-names>
</name>
<name>
<surname>Zimek</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sander</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Campello</surname>
<given-names>R. J. G. B.</given-names>
</name>
<name>
<surname>Micenkov&#xe1;</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Schubert</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>On the evaluation of unsupervised outlier detection: measures, datasets, and an empirical study</article-title>. <source>Data Min. Knowl. Discov.</source> <volume>30</volume>, <fpage>891</fpage>&#x2013;<lpage>927</lpage>. <pub-id pub-id-type="doi">10.1007/s10618-015-0444-8</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kuhlman</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Rundensteiner</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Eltabakh</surname>
<given-names>M.</given-names>
</name>
</person-group> &#x201c;<article-title>Multi-tactic distance-based outlier detection</article-title>,&#x201d; in <conf-name>Proceedings of the 2017 IEEE 33rd International Conference on Data Engineering (ICDE)</conf-name>, <conf-loc>San Diego, CA, USA</conf-loc>, <conf-date>April 2017</conf-date>, <fpage>959</fpage>&#x2013;<lpage>970</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Chawla</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gionis</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>k-means--: A unified approach to clustering and outlier detection</article-title>. <ext-link ext-link-type="uri" xlink:href="https://epubs.siam.org/doi/pdf/10.1137/1.9781611972832.21">https://epubs.siam.org/doi/pdf/10.1137/1.9781611972832.21</ext-link>.</citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Corain</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Garza</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Asudeh</surname>
<given-names>A.</given-names>
</name>
</person-group> &#x201c;<article-title>Dbscout: A density-based method for scalable outlier detection in very large datasets</article-title>,&#x201d; in <conf-name>Proceedings of the ICDE (IEEE)</conf-name>, <conf-loc>Chania, Greece</conf-loc>, <conf-date>April 2021</conf-date>, <fpage>37</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1109/icde51399.2021.00011</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Degirmenci</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Karal</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Efficient density and cluster based incremental outlier detection in data streams</article-title>. <source>Inf. Sci.</source> <volume>607</volume>, <fpage>901</fpage>&#x2013;<lpage>920</lpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2022.06.013</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Erfani</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Rajasegarar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Karunasekera</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Leckie</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>High-dimensional and large-scale anomaly detection using a linear one-class svm with deep learning</article-title>. <source>Pattern Recognit.</source> <volume>58</volume>, <fpage>121</fpage>&#x2013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2016.03.028</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fiore</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Santis</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Perla</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zanetti</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Palmieri</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Using generative adversarial networks for improving classification effectiveness in credit card fraud detection</article-title>. <source>Inf. Sci.</source> <volume>479</volume>, <fpage>448</fpage>&#x2013;<lpage>455</lpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2017.12.030</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Folgado</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Barandas</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Antunes</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nunes</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hartmann</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Tssearch: time series subsequence search library</article-title>. <source>SoftwareX</source> <volume>18</volume>, <fpage>101049</fpage>. <pub-id pub-id-type="doi">10.1016/j.softx.2022.101049</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goldstein</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dengel</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Histogram-based outlier score (hbos): A fast unsupervised anomaly detection algorithm</article-title>. <source>KI-2012 Poster Demo Track</source>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jahanbegloo</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jahanbegloo</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>Lof: identifying density-based local outliers</article-title>. <source>SIGMOD</source> <volume>26</volume>, <fpage>1</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1145/335191.335388</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>M. F.</given-names>
</name>
<name>
<surname>Tseng</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>C. M.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Two-phase clustering process for outliers detection</article-title>. <source>Pattern Recognit. Lett.</source> <pub-id pub-id-type="doi">10.1016/S0167-8655(00)00131-8</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Islam</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pecht</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>A hybrid feature selection scheme for reducing diagnostic performance deterioration caused by outliers in data-driven diagnostics</article-title>. <source>IEEE Trans. Industrial Electron.</source> <volume>63</volume>, <fpage>3299</fpage>&#x2013;<lpage>3310</lpage>. <pub-id pub-id-type="doi">10.1109/TIE.2016.2527623</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Knorr</surname>
<given-names>E. M.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>R. T.</given-names>
</name>
</person-group> &#x201c;<article-title>Algorithms for mining distance-based outliers in large datasets</article-title>,&#x201d; in <conf-name>Proceedings of the 24th International Conference on Very Large Data Bases</conf-name>, <conf-loc>San Francisco, CA, United States</conf-loc>, <conf-date>August 1998</conf-date>, <fpage>392</fpage>&#x2013;<lpage>403</lpage>.</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Knox</surname>
<given-names>E. M.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>R. T.</given-names>
</name>
</person-group> &#x201c;<article-title>Algorithms for mining distance-based outliers in large datasets</article-title>,&#x201d; in <conf-name>Proceedings of the international conference on very large data bases</conf-name>, <conf-loc>San Francisco, CA, United States</conf-loc>, <conf-date>August 1998</conf-date>, <fpage>392</fpage>&#x2013;<lpage>403</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kontaki</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gounaris</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Papadopoulos</surname>
<given-names>A. N.</given-names>
</name>
<name>
<surname>Tsichlas</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Manolopoulos</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Efficient and flexible algorithms for monitoring distance-based outliers over data streams</article-title>. <source>Inf. Syst.</source> <pub-id pub-id-type="doi">10.1016/j.is.2015.07.006</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A scaled-MST-based clustering algorithm and application on image segmentation</article-title>. <source>J. Intelligent Inf. Syst.</source> <pub-id pub-id-type="doi">10.1007/s10844-019-00572-x</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Botta</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ionescu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Ecod: unsupervised outlier detection using empirical cumulative distribution functions</article-title>. <source>IEEE Trans. Knowl. Data Eng</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2201.00382</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>M.</given-names>
</name>
</person-group> &#x201c;<article-title>Minimum spanning tree based spatial outlier mining and its applications</article-title>,&#x201d; in <conf-name>Proceedings of the Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)</conf-name>, <conf-loc>Chengdu, China</conf-loc>, <conf-date>May 2008</conf-date>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019a</year>). <article-title>Clustering with outlier removal</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>33</volume>, <fpage>2369</fpage>&#x2013;<lpage>2379</lpage>. <pub-id pub-id-type="doi">10.1109/TKDE.2019.2954317</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Schultz</surname>
<given-names>T.</given-names>
</name>
</person-group> &#x201c;<article-title>How long are various types of daily activities? Statistical analysis of a multimodal wearable sensor-based human activity dataset</article-title>,&#x201d; in <conf-name>Proceedings of the 15th International Joint Conference on Biomedical Engineering Systems and Technologies (BIOSTEC 2022)</conf-name>, <conf-loc>Online Streaming</conf-loc>, <conf-date>February 2022</conf-date>. <pub-id pub-id-type="doi">10.5220/0010896400003123</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2019b</year>). <article-title>Generative adversarial active learning for unsupervised outlier detection</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source>, <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1109/tkde.2019.2905606</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Manzoor</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Milajerdi</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Akoglu</surname>
<given-names>L.</given-names>
</name>
</person-group> &#x201c;<article-title>Fast memory-efficient anomaly detection in streaming heterogeneous graphs</article-title>,&#x201d; in <conf-name>Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery &#x26; Data Mining</conf-name>, <conf-loc>San Francisco, California, USA</conf-loc>, <conf-date>August 2016</conf-date>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Medak</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Review and analysis of minimum spanning tree using Prim&#x2019;s algorithm</article-title>. <source>Int. J. Comput. Sci. Trends Technol. (IJCST)</source> <volume>6</volume>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pevn&#xfd;</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Loda: lightweight on-line detector of anomalies</article-title>. <source>Mach. Learn.</source> <volume>102</volume>, <fpage>275</fpage>&#x2013;<lpage>304</lpage>. <pub-id pub-id-type="doi">10.1007/s10994-015-5521-0</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pham</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Pagh</surname>
<given-names>R.</given-names>
</name>
</person-group> &#x201c;<article-title>A near-linear time approximation algorithm for angle-based outlier detection in high-dimensional data</article-title>,&#x201d; in <conf-name>Proceedings of the 18th ACM SIGKDD international conference on Knowledge discovery and data mining</conf-name>, <conf-loc>Beijing, China</conf-loc>, <conf-date>August 2012</conf-date>. <pub-id pub-id-type="doi">10.1145/2339530.2339669</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Radovanovi&#x107;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nanopoulos</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ivanovi&#x107;</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Reverse nearest neighbors in unsupervised distance-based outlier detection</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>27</volume>, <fpage>1369</fpage>&#x2013;<lpage>1382</lpage>. <pub-id pub-id-type="doi">10.1109/TKDE.2014.2365790</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramaswamy</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rastogi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shim</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>Efficient algorithms for mining outliers from large data sets</article-title>. <source>ACM SIGMOD Rec.</source> <volume>29</volume>, <fpage>427</fpage>&#x2013;<lpage>438</lpage>. <pub-id pub-id-type="doi">10.1145/342009.335437</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodrigues</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Folgado</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Belo</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Schultz</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gamboa</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Feature-based information retrieval of multimodal biosignals with a self-similarity matrix: focus on automatic segmentation</article-title>. <source>Biosensors</source> <volume>12</volume>, <fpage>1182</fpage>. <pub-id pub-id-type="doi">10.3390/bios12121182</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Schlegl</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Seeb&#xf6;ck</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Waldstein</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Schmidt-Erfurth</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Langs</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Unsupervised anomaly detection with generative adversarial networks to guide marker discovery</article-title>,&#x201d; in <source>Information processing in medical imaging</source>. <person-group person-group-type="editor">
<name>
<surname>Niethammer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Styner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Aylward</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Oguz</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Yap</surname>
<given-names>P.-T.</given-names>
</name>
<etal/>
</person-group> (<publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>146</fpage>&#x2013;<lpage>157</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Schubert</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zimek</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kriegel</surname>
<given-names>H. P.</given-names>
</name>
</person-group> &#x201c;<article-title>Generalized outlier detection with flexible kernel density estimates</article-title>,&#x201d; in <conf-name>Proceedings of the SIAM International Conference on Data Mining 2014</conf-name>, <conf-date>April 2014</conf-date>, <fpage>542</fpage>&#x2013;<lpage>550</lpage>. <pub-id pub-id-type="doi">10.1137/1.9781611973440.63</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tseng</surname>
<given-names>V. S.</given-names>
</name>
<name>
<surname>Ying</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
</person-group> &#x201c;<article-title>FrauDetector: a graph-mining-based framework for fraudulent phone call detection</article-title>,&#x201d; in <conf-name>Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>, <conf-loc>Sydney, NSW, Australia</conf-loc>, <conf-date>August 2015</conf-date>, <fpage>2157</fpage>&#x2013;<lpage>2166</lpage>.</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X. L.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wilkes</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Enhancing minimum spanning tree-based clustering by removing density-based outliers</article-title>. <source>Digit. Signal Process.</source> <volume>23</volume>, <fpage>1523</fpage>&#x2013;<lpage>1538</lpage>. <pub-id pub-id-type="doi">10.1016/j.dsp.2013.03.009</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X. L.</given-names>
</name>
<name>
<surname>Wilkes</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>A minimum spanning tree-inspired clustering-based outlier detection technique</article-title>,&#x201d; in <source>ICDM</source> (<publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>209</fpage>&#x2013;<lpage>223</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-642-31488-9_17</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y. F.</given-names>
</name>
<name>
<surname>Jiong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>G. P.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>Y. R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A new outlier detection method based on OPTICS</article-title>. <source>Sustain. Cities Soc.</source> <volume>45</volume>, <fpage>197</fpage>&#x2013;<lpage>212</lpage>. <pub-id pub-id-type="doi">10.1016/j.scs.2018.11.031</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>H.</given-names>
</name>
</person-group> &#x201c;<article-title>A new local distance-based outlier detection approach for scattered real-world data</article-title>,&#x201d; in <conf-name>Proceedings of the Advances in Knowledge Discovery and Data Mining</conf-name>, <conf-loc>Bangkok, Thailand</conf-loc>, <conf-date>April 2009</conf-date>, <fpage>813</fpage>&#x2013;<lpage>822</lpage>.</citation>
</ref>
<ref id="B40">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> &#x201c;<article-title>Probabilistic-mismatch anomaly detection: do one&#x2019;s medications match with the diagnoses</article-title>,&#x201d; in <conf-name>Proceedings of the 2016 IEEE 16th International Conference on Data Mining (ICDM)</conf-name>, <conf-loc>Barcelona, Spain</conf-loc>, <conf-date>December 2016</conf-date>, <fpage>659</fpage>&#x2013;<lpage>668</lpage>.</citation>
</ref>
<ref id="B41">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hryniewicki</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Nasrullah</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>LSCP: locally selective combination in parallel outlier ensembles</article-title>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1812.01528">https://arxiv.org/abs/1812.01528</ext-link>.</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Nasrullah</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>PyOD: a Python toolbox for scalable outlier detection</article-title>. <source>J. Mach. Learn. Res.</source> <volume>20</volume>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zong</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Min</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Lumezanu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cho</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> &#x201c;<article-title>Deep autoencoding Gaussian mixture model for unsupervised anomaly detection</article-title>,&#x201d; in <conf-name>Proceedings of the 6th International Conference on Learning Representations</conf-name>, <conf-loc>Vancouver, BC, Canada</conf-loc>, <conf-date>April 2018</conf-date>.</citation>
</ref>
</ref-list>
</back>
</article>