<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Educ.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Education</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Educ.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2504-284X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/feduc.2025.1645911</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A comparison of three approaches for clustering polytomous data in the presence of masking variables</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Huang</surname> <given-names>Sijia</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/1665922"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Botter</surname> <given-names>Preston</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Sturm</surname> <given-names>Alexandra</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Education, Indiana University Bloomington</institution>, <city>Bloomington, IN</city>, <country country="us">United States</country></aff>
<aff id="aff2"><label>2</label><institution>Psychological Science, Loyola Marymount University</institution>, <city>Los Angeles, CA</city>, <country country="us">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Sijia Huang, <email xlink:href="mailto:sijhuang@iu.edu">sijhuang@iu.edu</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-12">
<day>12</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>10</volume>
<elocation-id>1645911</elocation-id>
<history>
<date date-type="received">
<day>12</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Huang, Botter and Sturm.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Huang, Botter and Sturm</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-12">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>To uncover the heterogeneity in a population, it is common yet important to partition individuals into distinct subgroups based on their responses to items in measurement tools. Various approaches have been introduced to tackle this clustering problem in psychology and education. To provide more guidance to practitioners, in this study, we compared the performance of three widely-applied approaches, including the latent class analysis (LCA), <italic>k</italic>-means and <italic>k</italic>-medians, in clustering polytomous items in the presence of masking variables. In the simulation conditions considered, we found that LCA coupled with Bayesian Information Criterion (BIC) outperformed other approaches and methods for determining the optimal number of subgroups. We also applied the three approaches to an empirical data set and obtained different conclusions regarding the number of subgroups. Additionally, we discussed the limitations of this study and future research directions.</p></abstract>
<kwd-group>
<kwd>latent class analysis</kwd>
<kwd><italic>k</italic>-means</kwd>
<kwd><italic>k</italic>-medians</kwd>
<kwd>clustering</kwd>
<kwd>polytomous data</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="2"/>
<table-count count="6"/>
<equation-count count="13"/>
<ref-count count="50"/>
<page-count count="12"/>
<word-count count="8509"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Assessment, Testing and Applied Measurement</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>To uncover and describe the heterogeneity in a population, it is a common yet important task to partition many individuals into a handful of distinct subgroups based on their responses to items in measurement tools. For instance, many studies in developmental psychology (e.g., <xref ref-type="bibr" rid="B4">Althoff et al., 2006</xref>; <xref ref-type="bibr" rid="B9">Basten et al., 2013</xref>; <xref ref-type="bibr" rid="B35">Nozadi et al., 2016</xref>; <xref ref-type="bibr" rid="B49">Wadsworth et al., 2001</xref>) have investigated the phenotype profiles of anxiety, depression, and attention-deficit/hyperactivity disorder (ADHD) using responses to items in Child Behavior Checklist (CBCL; <xref ref-type="bibr" rid="B1">Achenbach, 1999</xref>), a popular caregiver-report measure for assessing children&#x00027;s behavior problems. Additionally, in education, recent studies have identified subgroups of students with distinct online learning patterns (e.g., <xref ref-type="bibr" rid="B6">Araka et al., 2022</xref>; <xref ref-type="bibr" rid="B8">Barnard-Brak et al., 2010</xref>; <xref ref-type="bibr" rid="B11">Broadbent and Fuller-Tyszkiewicz, 2018</xref>) and school principals with different leadership types (e.g., <xref ref-type="bibr" rid="B2">Agasisti et al., 2019</xref>; <xref ref-type="bibr" rid="B5">Angela and Alex, 2014</xref>) using students&#x00027; and principals&#x00027; responses to survey questionnaires.</p>
<p>Various approaches have been introduced to tackle this problem of clustering a large number of individuals into non-overlapping subgroups. Based on if an explicit statistical model is invoked, these clustering approaches can be roughly divided into two categories: <italic>model-based</italic> and <italic>non-model-based</italic> approaches (<xref ref-type="bibr" rid="B14">Brusco et al., 2017</xref>). One of the most extensively applied model-based clustering approaches in psychology and education is the <italic>latent class analysis</italic> (LCA; e.g., <xref ref-type="bibr" rid="B27">Lazarsfeld, 1950a</xref>,<xref ref-type="bibr" rid="B28">b</xref>; <xref ref-type="bibr" rid="B21">Goodman, 1974</xref>; <xref ref-type="bibr" rid="B32">McCutcheon, 1987</xref>; <xref ref-type="bibr" rid="B48">Vermunt and Magidson, 2004</xref>), which can be viewed as a special case of finite mixture models (<xref ref-type="bibr" rid="B33">McLachlan and Peel, 2000</xref>) when data are categorical. LCA, along with other model-based clustering approaches, relies on statistical models and assumes that the observed data (i.e., item responses) are a mixture of several probability distributions. In contrast, non-model-based approaches, such as the <italic>k</italic>-means clustering (<xref ref-type="bibr" rid="B10">Bishop and Nasrabadi, 2006</xref>; <xref ref-type="bibr" rid="B29">MacQueen, 1967</xref>), <italic>k</italic>-medians clustering (<xref ref-type="bibr" rid="B25">Kaufman and Rousseeuw, 1990</xref>), and hierarchical cluster analysis (HCA; <xref ref-type="bibr" rid="B24">Johnson, 1967</xref>), determine how individuals are clustered based on measures of distance/dissimilarity between them.</p>
<p>Studies have been conducted to compare various clustering approaches to offer insight into their applications to different research contexts. For instance, <xref ref-type="bibr" rid="B14">Brusco et al. (2017)</xref> evaluated the performance of LCA, <italic>k</italic>-means, and <italic>k</italic>-medians algorithms in clustering dichotomous data. They found that all three approaches could well recover the cluster structures in the simulations but yielded different results when applied to a real data set. <xref ref-type="bibr" rid="B31">Magidson and Vermunt (2002)</xref>, also through a simulation study, compared LCA with <italic>k</italic>-means in clustering two continuous variables. <xref ref-type="bibr" rid="B40">Schreiber and Pekarik (2014)</xref> applied LCA, <italic>k</italic>-means, and HCA to empirical data sets and concluded that LCA was statistically more rigorous than the other two approaches. <xref ref-type="bibr" rid="B36">Papachristou et al. (2016)</xref> compared LCA with <italic>k</italic>-means and found that the two approaches identified similar clinical profiles based on a cancer symptoms data set.</p>
<p>To further improve the understanding of clustering approaches and provide guidance to practitioners, two issues need to be more closely investigated. First, measures in psychology and education typically consist of a large number of items; however, not all of them are essential for identifying subgroups in the population. Following <xref ref-type="bibr" rid="B12">Brusco (2004)</xref> and <xref ref-type="bibr" rid="B19">Fowlkes and Mallows (1983)</xref>, we use the term <italic>true variables</italic> for items in a measure that define the true subgroup structures, and <italic>masking variables</italic> to refer to those that are not relevant. While several methods for selecting true variables for the subsequent analysis (e.g., <xref ref-type="bibr" rid="B13">Brusco and Cradit, 2001</xref>; <xref ref-type="bibr" rid="B16">Carmone et al., 1999</xref>) have been developed, it is yet unclear if and how masking variables would impact the performance of both model-based and non-model-based approaches for clustering individuals.</p>
<p>Secondly, most of the existing studies on clustering approaches have been focused on dichotomous or continuous variables. However, items in psychological measures and educational survey questionnaires generally have more than two response categories (i.e., polytomous variables). For instance, items in CBCL are statements of children and youth&#x00027;s behaviors (e.g., [if a child] <italic>acts too young for his/her age</italic>) and are rated on a 3-point Likert scale (<italic>0 = Not true, 1 = Somewhat or sometimes true</italic>, and <italic>2 = Very true or often true</italic>). To be able to choose among various clustering approaches, researchers and practitioners in psychology and education must know how these approaches perform when data are polytomous.</p>
<p>To address these two issues, the present study looked into the performance of clustering approaches for assigning individuals into subgroups based on polytomous data in the presence of masking variables. Specifically, through a comprehensive simulation study and an empirical data analysis, we compared three popular approaches, including LCA, <italic>k</italic>-means, and <italic>k</italic>-medians, and methods for selecting the optimal number of subgroups. The remainder of this paper is organized as follows: in Section 2, we reviewed the three clustering approaches to be discussed in this study and the associated cluster number selecting methods. In Section 3, we conducted a simulation study to compare the performance of the three approaches in recovering subgroup structures under various conditions. In Section 4, we applied the three clustering approaches to an empirical data set and compared the results. Lastly, in Section 5, we discussed limitations of the present study and pointed to future research directions.</p></sec>
<sec id="s2">
<label>2</label>
<title>Clustering approaches</title>
<p>In this section, we briefly reviewed the three popular clustering approaches to be compared in the present study, including LCA, <italic>k</italic>-means, and <italic>k</italic>-medians, along with methods for selecting the optimal number of subgroups for each of the approaches.</p>
<sec>
<label>2.1</label>
<title>Latent class analysis</title>
<p>Since first introduced by <xref ref-type="bibr" rid="B27">Lazarsfeld (1950a</xref>,<xref ref-type="bibr" rid="B28">b</xref>), LCA has been undergoing significant development (e.g., <xref ref-type="bibr" rid="B21">Goodman, 1974</xref>; <xref ref-type="bibr" rid="B48">Vermunt and Magidson, 2004</xref>). Typically, LCA assumes that the population that is being studied is a mixture of two or more mutually exclusive subgroups (i.e., classes). The subgroup memberships of individuals are unobserved and lead to different distributions of the observed variables. Let <bold><italic>x</italic><sub><italic>i</italic></sub></bold> &#x0003D; (<italic>x</italic><sub><italic>i</italic>1</sub>, &#x02026;, <italic>x</italic><sub><italic>iv</italic></sub>, &#x02026;, <italic>x</italic><sub><italic>iV</italic></sub>), where 1 &#x02264; <italic>v</italic> &#x02264; <italic>V</italic>, denote the observed responses of individual <italic>i</italic> (1 &#x02264; <italic>i</italic> &#x02264; <italic>N</italic>) to <italic>V</italic> polytomous items/variables, where <italic>N</italic> indicates the number of individuals in the sample. Invoking the conditional independence assumption, we have that for an individual <italic>i</italic> that belongs to subgroup <italic>k</italic> (1 &#x02264; <italic>k</italic> &#x02264; <italic>K</italic>), the probability that he/she/they have the response pattern <italic>x</italic><sub><italic>i</italic></sub> is</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mrow><mml:mi>P</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>V</mml:mi></mml:munderover><mml:mi>P</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math><label>(1)</label></disp-formula>
<p>where <italic>P</italic>(<italic>x</italic><sub><italic>ij</italic></sub>|<italic>c</italic><sub><italic>i</italic></sub> &#x0003D; <italic>k</italic>) is the probability that a subgroup <italic>k</italic> individual scores <italic>x</italic><sub><italic>ij</italic></sub> on item <italic>j</italic>. Let <bold>&#x003BB;</bold> &#x0003D; (&#x003BB;<sub>1</sub>, &#x02026;, &#x003BB;<sub><italic>K</italic></sub>) be a vector that consists of the probabilities that an individual belongs to each of the <italic>K</italic> subgroups. The multivariate density of the individual <italic>i</italic> can then be specified as a mixture distribution,</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mrow><mml:mi>P</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>K</mml:mi></mml:munderover><mml:mrow><mml:msub><mml:mtext>&#x003BB;</mml:mtext><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:mstyle><mml:mi>P</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>K</mml:mi></mml:munderover><mml:mrow><mml:msub><mml:mtext>&#x003BB;</mml:mtext><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:mstyle><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>V</mml:mi></mml:munderover><mml:mi>P</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math><label>(2)</label></disp-formula>
<p>Then the overall likelihood <italic>L</italic> is the product of the contribution of each individual,</p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mrow><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mi>P</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>K</mml:mi></mml:munderover><mml:mrow><mml:msub><mml:mtext>&#x003BB;</mml:mtext><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:mstyle><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>V</mml:mi></mml:munderover><mml:mi>P</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mstyle><mml:mo>.</mml:mo></mml:mrow></mml:math><label>(3)</label></disp-formula>
<p>Estimates of model parameters are generally obtained through maximum likelihood (ML)-based estimation procedures, such as the expectation&#x02013;maximization (EM) algorithm.</p>
<sec>
<label>2.1.1</label>
<title>Determining <italic>K</italic></title>
<p>To determine the number of subgroups <italic>K</italic> in the population, a series of LCA models with different numbers of subgroups are fitted to observed data and compared. Due to the lack of absolute fit indices, in practice, two information criteria, namely the Akaike information criterion (AIC; <xref ref-type="bibr" rid="B3">Akaike, 1973</xref>) and Bayesian information criterion (BIC; <xref ref-type="bibr" rid="B41">Schwarz, 1978</xref>), are often adopted to aid in selecting the number of subgroups. Both AIC and BIC add penalty terms to the maximized likelihood of the model, denoted by <inline-formula><mml:math id="M4"><mml:mover accent="true"><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula>. Specifically, AIC factors in the number of parameters to reduce overfitting:</p>
<disp-formula id="EQ4"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">AIC</mml:mtext><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mn>2</mml:mn><mml:mo class="qopname">ln</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mn>2</mml:mn><mml:mi>f</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>where <italic>f</italic> refers to the number of free parameters in the model. BIC, on the other hand, penalizes for both the number of model parameters and the number of individuals in the sample <italic>N</italic>:</p>
<disp-formula id="EQ5"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">BIC</mml:mtext><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mn>2</mml:mn><mml:mo class="qopname">ln</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>f</mml:mi><mml:mo class="qopname">ln</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula></sec></sec>
<sec>
<label>2.2</label>
<title><italic>k</italic>-means and <italic>k</italic>-medians</title>
<p>The <italic>k</italic>-means clustering is probably the most widely-applied unsupervised machine learning technique, and <italic>k</italic>-medians clustering can be viewed as one of its variants. Unlike LCA, <italic>k</italic>-means and <italic>k</italic>-medians do not assume any statistical models but assign individuals to the pre-specified number of subgroups through minimizing the sum of within-cluster variation. Specifically, with <italic>k</italic>-means, the variation within each cluster is computed as the sum of the squared Euclidean distance between each individual and its cluster center,</p>
<disp-formula id="EQ6"><mml:math id="M7"><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:munder><mml:msup><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x000AF;</mml:mo></mml:mover><mml:mi>k</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:math><label>(6)</label></disp-formula>
<p>where <inline-formula><mml:math id="M8"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the means of all individuals within subgroup <italic>k</italic>. On the other hand, the <italic>k</italic>-medians clustering uses the medians as cluster centers and is therefore more robust to outliers. A smaller <italic>W</italic><sub><italic>k</italic></sub> indicates that the individuals in the subgroup <italic>k</italic> are more homogeneous. Note that while the Euclidean distance is often considered the default of <italic>k</italic>-means and <italic>k</italic>-medians, other distance measures, such as the Manhattan distance, can also be used.</p>
<p>This minimization problem of <italic>k</italic>-means and <italic>k</italic>-medians is solved through applying the iterative algorithm shown in <xref ref-type="other" rid="algorithm_1">Algorithm 1</xref>. The algorithm starts with randomly assigning individuals to subgroups and updates the subgroup memberships based on individuals&#x00027; distance to the cluster centers. The algorithm stops when the membership assignments stop changing. This optimization algorithm may stop at the local optimum rather than the global optimum. Therefore, in practice, one needs to run the algorithm multiple times with different initial configurations and then select the clustering solution with the smallest sum of within-cluster variation.</p>
<statement content-type="algorithm" id="algorithm_1">
<label>Algorithm 1</label>
<p>An iterative algorithm for <italic>k</italic>-means and <italic>k</italic>-medians.
<preformat>
Initialize by randomly assigning <italic>N</italic> individuals to <italic>K</italic> clusters.
<bold>repeat</bold>
Compute the center of each of the <italic>K</italic> clusters.
Assign each individual to the cluster with the nearest center.
<bold>until</bold> The cluster assignments of all individuals stop changing.
</preformat>
</p>
</statement>
<sec>
<label>2.2.1</label>
<title>Determining <italic>K</italic></title>
<p>Various methods have been proposed to determine the optimal number of subgroups for both <italic>k</italic>-means and <italic>k</italic>-medians clustering. In this study, we focused on the widely-applied <italic>M</italic>aximum <italic>R</italic>atio of <italic>P</italic>ercentage <italic>C</italic>hanges (MRPC) for <italic>k</italic>-means, and the Calinski-Harabasz (CH) pseudo-<italic>F</italic> statistic and the Silhouette method for both <italic>k</italic>-means and <italic>k</italic>-medians.</p>
<p><italic>Maximum ratio of percentage changes</italic> (MRPC). The MRPC determines the optimal number of subgroups by calculating the ratio of the percentage change in the sum of within-cluster variation as the pre-specified subgroup number increases. Let <italic>W</italic>(<italic>K</italic>) denote the sum of within-cluster variation when the number of subgroups for <italic>k</italic>-means is specified as <italic>K</italic>, i.e., <inline-formula><mml:math id="M9"><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. The MRPC associated with <italic>K</italic> subgroups is then computed as:</p>
<disp-formula id="EQ7"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>R</mml:mi><mml:mi>P</mml:mi><mml:mi>C</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>/</mml:mo><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>/</mml:mo><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>A larger MRPC means that the ratio of change from <italic>K</italic>&#x02212;1 to <italic>K</italic> subgroups is larger than the ratio of change from <italic>K</italic> to <italic>K</italic>&#x0002B;1 subgroups. Therefore, the optimal number of subgroups is typically selected as the <italic>K</italic> that is associated with the largest MRPC value.</p>
<p><italic>Calinski-Harabasz (CH) pseudo-F statistic</italic>. For <italic>k</italic>-means and <italic>k</italic>-medians, the CH pseudo-<italic>F</italic> statistic (Calinski and Harabasz, <xref ref-type="bibr" rid="B15">1974</xref>) assesses the quality of clustering by comparing the between-cluster variance to the within-cluster variance. Specifically, for <italic>K</italic> subgroups, it is computed as:</p>
<disp-formula id="EQ8"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>C</mml:mi><mml:mi>H</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>B</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>/</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>W</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>/</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>N</mml:mi><mml:mo>-</mml:mo><mml:mi>K</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where <italic>B</italic>(<italic>K</italic>) represents the between-cluster variance. <italic>B</italic>(<italic>K</italic>) is computed as the weighted sum of the squared Euclidean distance between cluster centers and the center of all individuals, with a larger <italic>B</italic>(<italic>K</italic>) value suggesting a more distinct separation of the <italic>K</italic> subgroups. Therefore, the number of subgroups <italic>K</italic> that has the largest CH pseudo-<italic>F</italic> statistic, which indicates that the <italic>K</italic> homogeneous subgroups are well separated, is selected as the optimal number of subgroups.</p>
<p><italic>Silhouette method</italic>. The Silhouette method (<xref ref-type="bibr" rid="B39">Rousseeuw, 1987</xref>) is another widely-applied way to evaluate the clustering quality for <italic>k</italic>-means and <italic>k</italic>-medians. Specifically, for each of the <italic>N</italic> individuals, the Silhouette width, denoted by <italic>s</italic><sub><italic>i</italic></sub>, which measures how similar an individual is to its assigned subgroup compared to other subgroups, is calculated as:</p>
<disp-formula id="EQ9"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo class="qopname">max</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ9">Equation 9</xref>, <italic>a</italic><sub><italic>i</italic></sub> represents the average dissimilarity of individual <italic>i</italic> to all other individuals in its assigned subgroup,</p>
<disp-formula id="EQ10"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>&#x02260;</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>where <italic>C</italic><sub><italic>i</italic></sub> denotes a collection of individuals that are assigned to the same subgroup as individual <italic>i</italic>, |<italic>C</italic><sub><italic>i</italic></sub>| is the number of individuals in the subgroup, and <italic>d</italic>(<italic>i, j</italic>) represents the distance between two individuals, <italic>i</italic> and <italic>j</italic>. On the other hand, <italic>b</italic><sub><italic>i</italic></sub> is the minimum average dissimilarity of individual <italic>i</italic> to all individuals in any other subgroup, and it is calculated as:</p>
<disp-formula id="EQ11"><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x02260;</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>where <italic>C</italic><sub><italic>k</italic></sub> represents the set of individuals assigned to subgroup <italic>k</italic>, and |<italic>C</italic><sub><italic>k</italic></sub>| is the number of individuals in it.</p>
<p>A Silhouette score <italic>s</italic><sub><italic>i</italic></sub> close to 1 (<xref ref-type="bibr" rid="B25">Kaufman and Rousseeuw, 1990</xref>; <xref ref-type="bibr" rid="B39">Rousseeuw, 1987</xref>) indicates that the individual <italic>i</italic> is well clustered in the sense that it is closer to other individuals in the same subgroup than to those in other subgroups. In contrast, an <italic>s</italic><sub><italic>i</italic></sub> value that is close to 0 suggests that the individual is on or near the decision boundary between two neighboring subgroups. Additionally, an <italic>s</italic><sub><italic>i</italic></sub> near -1 indicates that the individual may have been assigned to the wrong subgroup, since it is closer to individuals in other subgroups than to its assigned subgroup.</p>
<p>The average Silhouette score <inline-formula><mml:math id="M15"><mml:mover accent="true"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> reflects the overall quality of the clustering solution and is often used to determine the optimal number of subgroups. It is computed as the average of individual <italic>s</italic><sub><italic>i</italic></sub>:</p>
<disp-formula id="EQ12"><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>Common rules of thumb from empirical studies that examine the absolute fit of clustering models suggest that a <inline-formula><mml:math id="M17"><mml:mover accent="true"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> greater than 0.7 indicates a good clustering quality (<xref ref-type="bibr" rid="B25">Kaufman and Rousseeuw, 1990</xref>). In this study, our primary concern is the relative fit; therefore, the optimal number of subgroups is selected as the number of subgroups that maximizes the <inline-formula><mml:math id="M18"><mml:mover accent="true"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>.</p></sec></sec>
<sec>
<label>2.3</label>
<title>Other methods for determining <italic>K</italic></title>
<p>In addition to AIC and BIC for LCA, and MRPC, CH pseudo-<italic>F</italic> statistic and Silhouette method for <italic>k</italic>-means and <italic>k</italic>-medians, there are other methods for determining the optimal number of subgroups while tackling the clustering tasks.</p>
<p><italic>Sample-size adjusted BIC</italic>. The sample-size adjusted BIC (SABIC; <xref ref-type="bibr" rid="B42">Sclove, 1987</xref>) is another popular information criterion and is used for choosing the number of subgroups when LCA is applied. SABIC is similar to BIC; however, the penalty on the number of parameters <italic>p</italic> is reduced by replacing <italic>N</italic> with (<italic>N</italic>&#x0002B;2)/24,</p>
<disp-formula id="EQ13"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">SABIC</mml:mtext><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mn>2</mml:mn><mml:mo class="qopname">ln</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>p</mml:mi><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>24</mml:mn></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p><italic>Gap statistic</italic>. For <italic>k</italic>-means and <italic>k</italic>-medians, the Gap statistic (<xref ref-type="bibr" rid="B47">Tibshirani et al., 2002</xref>) is another option for determining the optimal number of subgroups. The Gap statistic compares the curve of log(<italic>W</italic>[<italic>K</italic>]) to the reference curve obtained from data uniformly distributed over a rectangle containing the data. Then the optimal number of subgroups is chosen as the <italic>K</italic> associated with the largest difference between the two curves.</p></sec></sec>
<sec id="s3">
<label>3</label>
<title>Simulation study</title>
<p>To compare the performance of LCA, <italic>k</italic>-means and <italic>k</italic>-medians approaches, along with their associated methods for determining the optimal number of subgroups, we conducted a comprehensive simulation. Specifically, we considered items with three response categories, which represent a wide range of items that are often seen in psychological assessments and educational survey questionnaires. Examples of three-category items include items that measure the frequencies of certain behaviors (0 = <italic>Never</italic>, 1 = <italic>Sometimes</italic>, and 2 = <italic>Usually</italic>) and items that inquire about the attitude toward certain statements (0 = <italic>Disagree</italic>, 1 = <italic>Neutral</italic>, and 2 = <italic>Agree</italic>).</p>
<sec>
<label>3.1</label>
<title>Data generation</title>
<p>To simulate data, we followed the process used in previous simulation studies (e.g., <xref ref-type="bibr" rid="B12">Brusco, 2004</xref>; <xref ref-type="bibr" rid="B14">Brusco et al., 2017</xref>; <xref ref-type="bibr" rid="B17">Dimitriadou et al., 2002</xref>), and manipulated six factors, including (1) the number of individuals <italic>N</italic>, (2) the number of subgroups <italic>K</italic>, (3) the number of true variables <italic>V</italic>, (4) the number of masking variables <italic>P</italic>, (5) the error level in the subgroup structure &#x003F5;, and (6) the densities of subgroups.</p>
<p>Motivated by previous studies, we generated item responses in the following three steps:</p>
<list list-type="bullet">
<list-item><p>First, we obtained values of <italic>V</italic> true variables based on the perfect subgroup structure (shown in <xref ref-type="table" rid="T1">Tables 1</xref>&#x02013;<xref ref-type="table" rid="T3">3</xref>), along with the number of individuals <italic>N</italic> and cluster probabilities. For instance, in conditions with <italic>V</italic> = 10, <italic>K</italic> = 2, <italic>N</italic> = 250 and equal subgroup probabilities, 125 (250/2) individuals would be assigned with responses presented in the first row of <xref ref-type="table" rid="T1">Table 1</xref>, while the other 125 individuals had responses in the second row.</p></list-item>
<list-item><p>In Step 2, we added errors to the item responses from Step 1 by randomly selecting &#x003F5; of the data points and changing their values. For instance, in conditions with &#x003F5; = 5%, we would randomly select 5% &#x000D7; <italic>N</italic>&#x000D7;<italic>V</italic> item responses and change their values from the perfect subgroup structure. For selected responses with values of 1, we would change their values to either 0 or 2 with the same probability of 0.5.</p></list-item>
<list-item><p>In the last step, we generated values of <italic>P</italic> masking variables, when applicable. In other words, additional <italic>N</italic>&#x000D7;<italic>P</italic> item responses were generated by randomly sampling from the three possible values, 0, 1, and 2. Then values of masking variables were appended to the noise-corrupted data from Step 2.</p></list-item>
</list>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Cluster structures for <italic>V</italic> &#x0003D; 10.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold><italic>K</italic></bold></th>
<th valign="top" align="center" colspan="10"><bold>Variable values</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 3</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 4</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>In <italic>K</italic> &#x0003D; 2 conditions, the distance between the two sub-groups was 3.16. In <italic>K</italic> &#x0003D; 3 conditions, the distances between pairs of the three sub-groups were 2.24, 3.16, and 2.24. In <italic>K</italic> &#x0003D; 4 conditions, the distances between pairs of sub-groups were 2.24, 3.46, 2.24, 5.00, 3.74, and 2.24.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Cluster structures for <italic>V</italic> &#x0003D; 20.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold><italic>K</italic></bold></th>
<th valign="top" align="center" colspan="20"><bold>Variable values</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 3</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 4</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>In <italic>K</italic> &#x0003D; 2 conditions, the distance between the two sub-groups was 4.47. In <italic>K</italic> &#x0003D; 3 conditions, the distances between pairs of the three sub-groups were 3.16, 4.47, and 3.16. In <italic>K</italic> &#x0003D; 4 conditions, the distances between pairs of sub-groups were 3.16, 4.90, 3.16, 7.07, 5.29, and 3.16.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Cluster structures for <italic>V</italic> &#x0003D; 30.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold><italic>K</italic></bold></th>
<th valign="top" align="center" colspan="30"><bold>Variable values</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 3</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic> &#x0003D; 4</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>In <italic>K</italic> &#x0003D; 2 conditions, the distance between the two sub-groups was 5.48. In <italic>K</italic> &#x0003D; 3 conditions, the distances between pairs of the three sub-groups were 3.87, 5.48, and 3.87. In <italic>K</italic> &#x0003D; 4 conditions, the distances between pairs of sub-groups were 3.87, 6.00, 3.87, 8.66, 6.48, and 3.87.</p>
</table-wrap-foot>
</table-wrap>
<p>Sample <monospace>R</monospace> code for data generation was included in the online <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>.</p>
<p><bold>Number of individuals</bold> <bold>N</bold>. We considered three levels of the number of individuals, <italic>N</italic> = 250, 500, and 1,000. These numbers are generally viewed as moderately large/large sample sizes for LCA, <italic>k</italic>-means and <italic>k</italic>-medians clustering.</p>
<p><bold>Number of subgroups</bold> <bold>K</bold>. While generating values of true variables, we considered three numbers of subgroups, <italic>K</italic> = 2, 3, and 4. The cluster numbers were chosen based on previous simulation studies (e.g., <xref ref-type="bibr" rid="B12">Brusco, 2004</xref>; <xref ref-type="bibr" rid="B14">Brusco et al., 2017</xref>; <xref ref-type="bibr" rid="B17">Dimitriadou et al., 2002</xref>).</p>
<p><bold>Number of true variables</bold> <bold>V</bold>. In this simulation study, we considered three numbers of true variables (i.e., variables that define the subgroup structure), <italic>V</italic> = 10, 20, and 30. These numbers covered a wide range of the lengths of psychological assessments and educational survey questionnaires.</p>
<p><bold>Number of masking variables</bold> <bold>P</bold>. In practice, it is possible that not all variables in a measure define the subgroup structure. Therefore, in addition to conditions where there exists no masking variable (i.e., <italic>P</italic> = 0), we considered two levels of the number of masking variables, <italic>P</italic> = 5 and 10.</p>
<p><bold>Level of error</bold> <bold>&#x003F5;</bold>. Following previous simulation studies, we considered three levels of errors in Step 2 of the data generation process, &#x003F5; = 5%, 10%, and 20%.</p>
<p><bold>Subgroup probabilities</bold>. We considered three types of cluster probabilities, one of which represents cases with equal probability and two for unequal probability conditions. Specifically, in the equal probability conditions, all subgroups are of the same size so that the probability that an individual belongs to one of the <italic>K</italic> subgroups is 1/<italic>K</italic>. In the first type of unequal probability conditions, the first subgroup had a probability of 0.6, and the other <italic>K</italic>&#x02212;1 subgroups had the same probability of (1 &#x02212; 0.6)/(<italic>K</italic>&#x02212;1). In the second type of unequal probability conditions, the first subgroup had a probability of 0.1, and the other clusters had the probability of (1 &#x02212; 0.1)/(<italic>K</italic>&#x02212;1).</p>
<p>In sum, these six manipulated factors led to a total of 3 &#x000D7; 3 &#x000D7; 3 &#x000D7; 3 &#x000D7; 3 &#x000D7; 3 &#x0003D; 729 simulation conditions. For each of the conditions, we simulated and analyzed 50 data sets in <monospace>R</monospace> (<xref ref-type="bibr" rid="B37">R Core Team, 2023</xref>).</p></sec>
<sec>
<label>3.2</label>
<title>Data analysis</title>
<p>We analyzed each simulated data set using the three approaches of LCA, <italic>k</italic>-means and <italic>k</italic>-medians clustering, extracting 1 to 6 subgroups, and chose the optimal number of subgroups based on different methods. Specifically, we performed LCA using the <monospace>poLCA</monospace> package (<xref ref-type="bibr" rid="B18">Drew and Jeffrey, 2011</xref>) and extracted the AIC and BIC indices. For <italic>k</italic>-means, we used the <monospace>R</monospace> package <monospace>cluster</monospace> (<xref ref-type="bibr" rid="B30">Maechler et al., 2013</xref>) for clustering and computed the MRPC, CH pseudo-<italic>F</italic> statistic, and Silhouette statistic based on the outputs. For <italic>k</italic>-medians clustering, we used the <monospace>Kmedians</monospace> package (<xref ref-type="bibr" rid="B20">Godichon-Baggioni, 2022</xref>) for subgroup extraction and obtained the CH pseudo-<italic>F</italic> statistic and Silhouette statistic using the <monospace>cluster</monospace> package. For all three approaches and numbers of subgroups, the numbers of random starting values/configurations were fixed at 100, and the maximum number of iterations was set to 1,000; unless otherwise specified, defaults were used.</p></sec>
<sec>
<label>3.3</label>
<title>Evaluation criteria</title>
<p>In this study, we focused on the key indicator of subgroup partition recovery to evaluate the three clustering approaches and their associated methods for choosing the number of subgroups. Following previous simulation studies, for each simulation condition, we computed the <italic>A</italic>djusted <italic>R</italic>and <italic>I</italic>ndex (ARI; <xref ref-type="bibr" rid="B23">Hubert, 1974</xref>; <xref ref-type="bibr" rid="B46">Steinley, 2004</xref>) between the true subgroup memberships and predicted partitions for each combination of clustering approaches and methods for choosing the subgroup number using the <monospace>Kmedians</monospace> package (<xref ref-type="bibr" rid="B7">Azzalini and Menardi, 2014</xref>). The ARI index quantifies the agreement between two partitions and has a maximum value of 1 (it can fall below 0 for agreement worse than chance), with a larger value indicating a higher level of agreement. We then performed the analysis of variance (ANOVA) to investigate how the ARI was impacted by the clustering approaches and the manipulated factors. We considered main effects of the six manipulated factors and all two-way interactions. To better inform about the performances of these approaches, we also plotted the determined number of subgroups for each combination of clustering approaches and methods of choosing subgroup number.</p></sec>
<sec>
<label>3.4</label>
<title>Results</title>
<p>As shown in <xref ref-type="table" rid="T4">Table 4</xref>, the ARIs of the seven combinations of clustering approaches and methods for selecting the optimal subgroup number (i.e., <italic>Methods</italic>) were significantly different (<italic>F</italic> &#x0003D; 375.6, <italic>df</italic> &#x0003D; 6, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> &#x0003D; 0.31). The <italic>post hoc</italic> Tukey&#x00027;s HSD test indicated that LCA outperformed the <italic>k</italic>-means and <italic>k</italic>-medians clustering, with BIC leading to a greater ARI than AIC. Additionally, <italic>k</italic>-means coupled with the MRPC performed significantly better than the other four methods, including <italic>k</italic>-means and <italic>k</italic>-medians coupled with the CH Pseudo-<italic>F</italic> statistic and the Silhouette method.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Analysis of variance (ANOVA) for ARI.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Source</bold></th>
<th valign="top" align="center"><bold><italic>SS</italic></bold></th>
<th valign="top" align="center"><bold><italic>df</italic></bold></th>
<th valign="top" align="center"><bold><italic>F</italic>-value</bold></th>
<th valign="top" align="center"><bold><italic>p</italic>-value</bold></th>
<th valign="top" align="center"><bold>Partial &#x003B7;<sup>2</sup></bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Methods</td>
<td valign="top" align="center">38.6</td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">375.6</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.31</td>
</tr>
<tr>
<td valign="top" align="left">Sample size <italic>N</italic></td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">Number of clusters <italic>K</italic></td>
<td valign="top" align="center">46.3</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">1,352.7</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.35</td>
</tr>
<tr>
<td valign="top" align="left">Number of true variables <italic>V</italic></td>
<td valign="top" align="center">10.2</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">299.2</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.11</td>
</tr>
<tr>
<td valign="top" align="left">Number of masking variables <italic>P</italic></td>
<td valign="top" align="center">3.4</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">99.8</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.04</td>
</tr>
<tr>
<td valign="top" align="left">Error level <italic>E</italic></td>
<td valign="top" align="center">19.5</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">568.2</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.18</td>
</tr>
<tr>
<td valign="top" align="left">Density <italic>D</italic></td>
<td valign="top" align="center">2.3</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">65.9</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">N &#x000D7; K</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">0.3</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">N &#x000D7; V</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">1.1</td>
<td valign="top" align="center">0.37</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">N &#x000D7; P</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">N &#x000D7; E</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">N &#x000D7; D</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">0.4</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">K &#x000D7; V</td>
<td valign="top" align="center">0.8</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">11.0</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="left">K &#x000D7; P</td>
<td valign="top" align="center">0.7</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">10.3</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="left">K &#x000D7; E</td>
<td valign="top" align="center">1.1</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">16.1</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="left">K &#x000D7; D</td>
<td valign="top" align="center">3.9</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">56.2</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.04</td>
</tr>
<tr>
<td valign="top" align="left">V &#x000D7; P</td>
<td valign="top" align="center">0.3</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">4.7</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">V &#x000D7; E</td>
<td valign="top" align="center">3.0</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">44.1</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">V &#x000D7; D</td>
<td valign="top" align="center">2.4</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">34.8</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">P &#x000D7; E</td>
<td valign="top" align="center">1.1</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">16.0</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="left">P &#x000D7; D</td>
<td valign="top" align="center">0.5</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">7.0</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="left">E &#x000D7; D</td>
<td valign="top" align="center">2.1</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">30.0</td>
<td valign="top" align="center">&#x0003C; 0.01<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.02</td>
</tr>
<tr>
<td valign="top" align="left">Residuals</td>
<td valign="top" align="center">86.0</td>
<td valign="top" align="center">5,024</td>
<td/>
<td/>
<td/>
</tr></tbody>
</table>
<table-wrap-foot>
<p><italic>SS</italic> denotes sum of squares. The Methods represents the combinations of clustering approaches and selection methods. <sup>&#x0002A;&#x0002A;&#x0002A;</sup> indicates <italic>p</italic>-value &#x0003C; 0.001.</p>
</table-wrap-foot>
</table-wrap>
<p>Besides, five out of the six manipulated simulation factors showed significant main effects on the ARI, including the number of clusters <italic>K</italic> (<italic>F</italic> &#x0003D; 1,352.7, <italic>df</italic> &#x0003D; 2, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> &#x0003D; 0.35), the number of true variables <italic>V</italic> (<italic>F</italic> &#x0003D; 299.2, <italic>df</italic> &#x0003D; 2, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> &#x0003D; 0.11), the number of masking variables <italic>P</italic> (<italic>F</italic> &#x0003D; 99.8, <italic>df</italic> &#x0003D; 2, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> &#x0003D; 0.04), the error level <italic>E</italic> (<italic>F</italic> &#x0003D; 568.2, <italic>df</italic> &#x0003D; 2, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> &#x0003D; 0.18), and the cluster density <italic>D</italic> (<italic>F</italic> &#x0003D; 65.9, <italic>df</italic> &#x0003D; 2, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> &#x0003D; 0.03). Lastly, ten two-way interactions were significant; however, the effect sizes were all smaller than 0.04.</p>
<p><xref ref-type="fig" rid="F1">Figure 1</xref> presents the aggregated ARI values by three out of the six manipulated factors with effect sizes that were greater than 0.1, including the number of clusters <italic>K</italic>, the number of true variables <italic>V</italic>, and the error level <italic>E</italic>. A general trend was that the ARI value decreases as <italic>K</italic> increases, <italic>V</italic> decreases, and <italic>E</italic> increases. For instance, as shown in <xref ref-type="fig" rid="F1">Figure 1b</xref>, when <italic>V</italic> increases, the ARI value of <italic>k</italic>-means coupled with MRPC improved from 0.7 to 0.93. Another example is that, as shown in <xref ref-type="fig" rid="F1">Figure 1c</xref>, when <italic>E</italic> increases, the ARI of <italic>k</italic>-medians coupled with CH Pseudo-<italic>F</italic> decreased from 0.81 to 0.66. It is also worth noting that the LCA approach had strong subgroup recovery performance in <italic>V</italic> &#x0003D; 10 and <italic>E</italic> &#x0003D; 0.2 conditions, among which the lowest ARI value was 0.81.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Aggregated ARI values by different factors. <bold>(a)</bold> By number of clusters <italic>K</italic>. <bold>(b)</bold> By number of true variables <italic>V</italic>. <bold>(c)</bold> By number of error level <italic>E</italic>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="feduc-10-1645911-g0001.tif">
<alt-text content-type="machine-generated">Bar charts comparing the Adjusted Rand Index (ARI) of six clustering methods under different conditions. (a) Varying the number of clusters k shows different ARI scores for K = 2, 3, and 4, with values ranging from 0.55 to 1.00. (b) Varying true variables V from 10 to 30 results in ARI ranging from 0.68 to 1.00. (c) Varying error levels E from 0.05 to 0.2, showing ARI from 0.66 to 1.00. Each chart includes methods like k-means, k-medians, and LCA evaluated by CH, MRPC, AIC, BIC, and Pseudo-F criteria.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F2">Figure 2</xref> shows boxplots of the determined numbers of subgroups by combinations of clustering approaches and associated methods, aggregated over all simulation factors other than <italic>K</italic>. As shown in <xref ref-type="fig" rid="F2">Figure 2a</xref>, when <italic>K</italic> &#x0003D; 2, LCA coupled with BIC and <italic>k</italic>-means and <italic>k</italic>-medians tended to choose 2 as the optimal number of sub-groups; while LCA coupled with AIC overestimated <italic>K</italic>. When the true number of subgroups was 3 or 4, LCA coupled with BIC was still able to select the correct subgroup number; however, <italic>k</italic>-means and <italic>k</italic>-medians coupled with MRPC and CH Pseudo-<italic>F</italic> tended to underestimate <italic>K</italic> and selected 2 as the number of subgroups, while LCA coupled with AIC favored more subgroups.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Boxplot of numbers of subgroups selected based on different methods. <bold>(a)</bold> True <italic>K</italic> = 2. <bold>(b)</bold> True <italic>K</italic> = 3. <bold>(c)</bold> True <italic>K</italic> = 4.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="feduc-10-1645911-g0002.tif">
<alt-text content-type="machine-generated">Box plots displaying estimated K values for clustering methods including k-means and k-medians with CH Pseudo-F, MRPC, Silhouette, and LCA with AIC and BIC. Panels represent true K values of two, three, and four.</alt-text>
</graphic>
</fig>
</sec></sec>
<sec id="s4">
<label>4</label>
<title>Empirical illustration</title>
<p>To further compare the three approaches, LCA, <italic>k</italic>-means and <italic>k</italic>-medians, along with the methods for selecting the number of subgroups, we analyzed an empirical data set from the Simon Foundation Powering Autism Research for Knowledge (SPARK; <xref ref-type="bibr" rid="B44">SPARK Consortium, 2018</xref>), a national research initiative funded by the Simons Foundation Autism Research Initiative (SFARI).</p>
<sec>
<label>4.1</label>
<title>Data</title>
<p>We examined a parent-rated measurement instrument named the <italic>Vineland Adaptive Behavior Scales Third Edition</italic> (Vineland-3; <xref ref-type="bibr" rid="B45">Sparrow et al., 2016</xref>) in the SPARK data base. The Vineland-3 is used as a measure of adaptive skills and is often a part of diagnostic evaluations for intellectual and developmental disabilities. Participant data were drawn from the SPARK data repository that includes phenotype and genotype data for individuals with an autism spectrum diagnosis. Thus, participants included in the empirical illustration all had a diagnosis of an autism spectrum disorder (ASD) and were all under the age of 18.</p>
<p>In this analysis, we focused on one of the 13 subdomains in Vineland-3, the <italic>Domestic</italic> subdomain, which includes 30 items. Items in this sub-domain measure a child&#x00027;s ability to perform household tasks (e.g., <italic>[The individual] is careful when using sharp objects, for example, scissor, knives</italic>). Parents rated the children&#x00027;s behaviors on a 3-point Likert-type scale (0 = <italic>Never</italic>, 1 = <italic>Sometimes</italic>, and 2 = <italic>Usually or often</italic>). A sample of <italic>N</italic> &#x0003D; 500 individuals were randomly selected from complete observations for the analysis.</p></sec>
<sec>
<label>4.2</label>
<title>Analysis</title>
<p>We performed LCA, <italic>k</italic>-means and <italic>k</italic>-medians clustering on the sample data in <monospace>R</monospace> (<xref ref-type="bibr" rid="B37">R Core Team, 2023</xref>). Specifically, for each of the three approaches, we ran a series of analyses with different numbers of subgroups, ranging from 1 to 10. Based on the outputs, we chose the optimal number of subgroups using various methods, including AIC and BIC for LCA, and MRPC, CH pseudo-<italic>F</italic> and Silhouette statistics for <italic>k</italic>-means and <italic>k</italic>-medians. Then we computed the agreements between partitions with the subgroups numbers determined by different methods.</p></sec>
<sec>
<label>4.3</label>
<title>Results</title>
<p><xref ref-type="table" rid="T5">Table 5</xref> summarizes various indices for the three approaches with different numbers of subgroups. The first observation is that the number of subgroups selected based on different approaches varied. Based on LCA coupled with AIC, the number of subgroups was 9, and the selected number of subgroups was 3 when BIC was applied. On the other hand, the numbers of subgroups determined by <italic>k</italic>-means and <italic>k</italic>-medians coupled with MRPC, CH pseudo-<italic>F</italic> and Silhouette statistics were all 2.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Summary of empirical data analysis results.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Approach</bold></th>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center" colspan="10"><bold>Clusters</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td/>
<td valign="top" align="center"><bold>1</bold></td>
<td valign="top" align="center"><bold>2</bold></td>
<td valign="top" align="center"><bold>3</bold></td>
<td valign="top" align="center"><bold>4</bold></td>
<td valign="top" align="center"><bold>5</bold></td>
<td valign="top" align="center"><bold>6</bold></td>
<td valign="top" align="center"><bold>7</bold></td>
<td valign="top" align="center"><bold>8</bold></td>
<td valign="top" align="center"><bold>9</bold></td>
<td valign="top" align="center"><bold>10</bold></td>
</tr>
<tr>
<td valign="top" align="left">LCA</td>
<td valign="top" align="center">AIC</td>
<td valign="top" align="center">27,422.21</td>
<td valign="top" align="center">26,391.38</td>
<td valign="top" align="center">25,976.80</td>
<td valign="top" align="center">25,720.56</td>
<td valign="top" align="center">25,567.88</td>
<td valign="top" align="center">25,466.04</td>
<td valign="top" align="center">25,396.14</td>
<td valign="top" align="center">25,344.27</td>
<td valign="top" align="center">25,313.24</td>
<td valign="top" align="center">25,331.56</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BIC</td>
<td valign="top" align="center">27,675.09</td>
<td valign="top" align="center">26,901.34</td>
<td valign="top" align="center">26,743.86</td>
<td valign="top" align="center">26,744.71</td>
<td valign="top" align="center">26,849.12</td>
<td valign="top" align="center">27,004.37</td>
<td valign="top" align="center">27,191.57</td>
<td valign="top" align="center">27,396.78</td>
<td valign="top" align="center">27,622.85</td>
<td valign="top" align="center">27,898.26</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic>-means</td>
<td valign="top" align="center">W</td>
<td valign="top" align="center">6,170.95</td>
<td valign="top" align="center">5,500.13</td>
<td valign="top" align="center">5,168.88</td>
<td valign="top" align="center">4,966.18</td>
<td valign="top" align="center">4,802.85</td>
<td valign="top" align="center">4,697.76</td>
<td valign="top" align="center">4,610.65</td>
<td valign="top" align="center">4,532.04</td>
<td valign="top" align="center">4,455.32</td>
<td valign="top" align="center">4,383.20</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">Pseudo-<italic>F</italic></td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">60.74</td>
<td valign="top" align="center">48.18</td>
<td valign="top" align="center">40.11</td>
<td valign="top" align="center">35.25</td>
<td valign="top" align="center">30.98</td>
<td valign="top" align="center">27.81</td>
<td valign="top" align="center">25.42</td>
<td valign="top" align="center">23.63</td>
<td valign="top" align="center">22.21</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">Silhouette</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">0.10</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic>-medians</td>
<td valign="top" align="center">Pseudo-<italic>F</italic></td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">60.73</td>
<td valign="top" align="center">48.14</td>
<td valign="top" align="center">40.03</td>
<td valign="top" align="center">35.19</td>
<td valign="top" align="center">30.90</td>
<td valign="top" align="center">27.71</td>
<td valign="top" align="center">25.15</td>
<td valign="top" align="center">23.38</td>
<td valign="top" align="center">21.65</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">Silhouette</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">0.10</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.06</td>
<td valign="top" align="center">0.05</td>
</tr></tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="T6">Table 6</xref> summarizes the ARIs between subgroup partitions determined based on different methods. The agreements between LCA coupled with AIC and other methods were low, with the ARI value ranging from 0.15 to 0.24. The agreements of LCA coupled with BIC and <italic>k</italic>-means and <italic>k</italic>-medians were moderate. The ARI between <italic>k</italic>-means and <italic>k</italic>-medians was 0.98, suggesting a high level of agreement between the two partitions.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>ARI between partitions based on different methods.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold>LCA-AIC</bold></th>
<th valign="top" align="center"><bold>LCA-BIC</bold></th>
<th valign="top" align="center"><bold><italic>K</italic>-means</bold></th>
<th valign="top" align="center"><bold><italic>K</italic>-medians</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">LCA-BIC</td>
<td valign="top" align="center">0.24</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">0.51</td>
<td valign="top" align="center">0.51</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic>-means</td>
<td valign="top" align="center">0.15</td>
<td valign="top" align="center">0.51</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">0.98</td>
</tr>
<tr>
<td valign="top" align="left"><italic>K</italic>-medians</td>
<td valign="top" align="center">0.15</td>
<td valign="top" align="center">0.51</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">&#x02013;</td>
</tr></tbody>
</table>
</table-wrap>
</sec></sec>
<sec sec-type="discussion" id="s5">
<label>5</label>
<title>Discussion</title>
<p>In psychology and education, assigning a large number of individuals to a handful of subgroups based on their responses to items in measurement scales has long been an important task to understand the population heterogeneity. To better choose between various model-based and non-model-based clustering approaches, it is vital for researchers to investigate the performance of these approaches, along with the methods for determining the optimal number of subgroups, under different scenarios.</p>
<p>In this study, we compared the performance of three widely-applied approaches, LCA, <italic>k</italic>-means, and <italic>k</italic>-medians, in clustering individuals based on polytomous item responses in the presence of masking variables. Previous studies that compared different clustering approaches have been primarily focused on either binary (e.g., <xref ref-type="bibr" rid="B14">Brusco et al., 2017</xref>) or continuous variables (e.g., <xref ref-type="bibr" rid="B31">Magidson and Vermunt, 2002</xref>). Our study extended this line of work by focusing on polytomous variables, specifically items with three categories, which are common in psychological and educational measurement but less frequently researched. Recent work by <xref ref-type="bibr" rid="B22">Haslbeck et al. (2023)</xref> showed that when ordinal responses are treated as continuous, Gaussian mixture models (GMM) can still recover the correct number of components if there are enough categories and variables; however, parameter estimates (particularly means and covariances) remain biased regardless of the sample size. This finding highlighted that results obtained with continuous or binary indicators may not directly generalize to polytomous data. Moreover, while latent profile analysis (LPA; a GMM with a constrained covariance matrix) is typically applied to continuous indicators and LCA is applied to categorical indicators, more research is needed to determine how consistently these approaches perform across items with different numbers of categories and study conditions.</p>
<p>We found that, via the simulation, LCA coupled with BIC generally provided the most accurate subgroup recovery. In contrast, <italic>k</italic>-means and <italic>k</italic>-medians can be more sensitive to simulation design factors, such as the true number of subgroups, the presence of masking variables and noise in the true variables. For instance, in <italic>K</italic> &#x0003D; 2 conditions, the average ARIs of <italic>k</italic>-means and <italic>k</italic>-medians and their associated methods for choosing subgroup number were all above 0.9. When the number of true variables was relatively large (e.g., <italic>V</italic> &#x0003D; 30), <italic>k</italic>-medians coupled with the CH Pseudo-<italic>F</italic> could still achieve an average ARI of 0.78. Our findings were not consistent with studies that examined the performance of clustering approaches for dichotomous data. For instance, <xref ref-type="bibr" rid="B14">Brusco et al. (2017)</xref> showed that LCA and <italic>k</italic>-medians were competitive for clustering dichotomous data, while our study suggests LCA outperforms the other two approaches when items have three categories.</p>
<p>Additionally, we analyzed empirical data from a database on autistic children using the three approaches and chose the optimal number of subgroups based on different methods. Similar to the simulation results, we found that the numbers of subgroups determined by <italic>k</italic>-means and <italic>k</italic>-medians were the same and the ARI was high. On the other hand, the number of subgroups and partitions determined by LCA coupled with BIC were different from those of LCA with AIC and the two non-model-based clustering approaches. Such inconsistency between clustering solutions of different approaches was also observed in <xref ref-type="bibr" rid="B14">Brusco et al. (2017)</xref>.</p>
<p>Based on the results of this study, we recommend psychological and educational practitioners consider LCA for clustering tasks when the items in the measurement scales are polytomous. Additionally, in practice, while applying <italic>k</italic>-means and <italic>k</italic>-medians, researchers may experience the issue of <italic>local optima</italic>, which refers to the tendency that the algorithm converges at a local rather than the global optimum. To address this issue, we suggest adopting the widely known <italic>multiple restart</italic> strategy (e.g., <xref ref-type="bibr" rid="B43">Shireman et al., 2016</xref>), where the algorithm is fit multiple times with different starting values/configurations. Further, to better understand the heterogeneity in the population and interpret the subgroup structure, we suggest determining the optimal number of subgroups based on both the statistical evidence and substantive theories. This is because the global optimum may be achieved at a large number of subgroups, which can be less useful with regard to interpretation.</p>
<sec>
<label>5.1</label>
<title>Limitations</title>
<p>As with other simulations, one of the limitations of the present study concerns the manipulated factors and the chosen levels of these manipulated factors. For instance, the sample sizes <italic>N</italic> we considered in this simulation (250, 500, and 1,000) are generally viewed as moderately large and large sample sizes in psychological research. Therefore, our results may not be generalizable to cases where the sample sizes are relatively small. We also designed only one subgroup structure for each combination of the number of true variables <italic>V</italic> and the number of subgroups <italic>K</italic>. In reality, the distances between subgroups tend to be different. In addition, we simulated responses to items with three categories, while many psychological assessments and education surveys consist of items with four or more response categories. Therefore, for future research, we suggest more investigations on the impacts of true subgroup structures and measurement scale properties on the performance of clustering approaches.</p>
<p>Second, while maintaining consistency with prior studies, the process we followed to generate data is prone to several issues. For instance, we introduced randomness in Steps 2 and 3 of the process, but duplicates may exist and result in numerical instability of the optimization algorithms. We advocate for more discussion in the psychological and educational measurement community on how to generate data that can better reflect reality. In addition, when different approaches are to be compared via a simulation, the data generation process should be carefully designed so that the simulated data do not favor any of the approaches.</p>
<p>Last but not least, some other popular clustering approaches were not considered due to the scope of this study. These approaches include but are not limited to HCA, spectral clustering (e.g., <xref ref-type="bibr" rid="B34">Ng et al., 2002</xref>; <xref ref-type="bibr" rid="B38">Rohe et al., 2011</xref>), and autoencoders for clustering (e.g., <xref ref-type="bibr" rid="B26">Klingler et al., 2017</xref>; <xref ref-type="bibr" rid="B50">Zhang et al., 2022</xref>). In addition, to better align with existing studies (e.g., <xref ref-type="bibr" rid="B14">Brusco et al., 2017</xref>), we did not include all possible methods for determining the optimal number of subgroups for the three clustering approaches in our simulation and empirical data analysis, such as SABIC and the Gap statistic.</p></sec>
<sec>
<label>5.2</label>
<title>Future directions</title>
<p>To enhance the understanding of LCA, <italic>k</italic>-means and <italic>k</italic>-medians when applied to psychological and educational research, we suggest expanding the scope of the simulation to account for various item types, true subgroup structures, and sample sizes. Future studies can also include factors related to psychometric properties of measurement scales, such as their reliabilities and item properties, in the simulation.</p>
<p>Secondly, to provide practitioners with a more comprehensive picture, we recommend comparing the three approaches considered in this study with other approaches for clustering tasks and other methods for choosing the subgroup numbers. Moreover, characteristics of the approaches, such as available software programs and the computation time, should be discussed in addition to their performance in recovering the subgroup partition. We also suggest that practitioners compare the solutions of different approaches (e.g., numbers of subgroups and assignment of subgroup memberships) when completing clustering tasks and choose the more interpretable results.</p></sec></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: Simons Foundation Powering Autism Research for Knowledge.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>SH: Conceptualization, Formal analysis, Methodology, Visualization, Writing &#x02013; original draft. PB: Formal analysis, Methodology, Visualization, Writing &#x02013; original draft. AS: Writing &#x02013; review &#x00026; editing.</p>
</sec>
<ack><title>Acknowledgments</title><p>The authors acknowledge the individuals and their families who participated in the SPARK (Simons Foundation Powering Autism Research for Knowledge) study for their contribution to research. Data analysis was performed using resources from the SPARK consortium.</p></ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/feduc.2025.1645911/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/feduc.2025.1645911/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Achenbach</surname> <given-names>T. M.</given-names></name></person-group> (<year>1999</year>). <article-title>&#x0201C;The Child Behavior Checklist and related instruments,&#x0201D;</article-title> in <source>The use of psychological testing for treatment planning and outcomes assessment</source>, ed. M. E. Maurish (Lawrence Erlbaum Associates Publishers, 2nd edition), <fpage>429</fpage>&#x02013;<lpage>466</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Agasisti</surname> <given-names>T.</given-names></name> <name><surname>Bowers</surname> <given-names>A. J.</given-names></name> <name><surname>Soncin</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>School principals leadership types and student achievement in the Italian context: empirical results from a three-step latent class analysis</article-title>. <source>Educ. Manag. Admin. Leader</source>. <volume>47</volume>, <fpage>860</fpage>&#x02013;<lpage>886</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1741143218768577</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Akaike</surname> <given-names>H.</given-names></name></person-group> (<year>1973</year>). <article-title>&#x0201C;Information theory and an extension of the maximum likelihood principle,&#x0201D;</article-title> in <source>Proceedings of the Second International Symposium on Information Theory</source>, eds. B. N. Petrov, and F. Csaki (Budapest: Akademiai Kiado), <fpage>267</fpage>&#x02013;<lpage>281</lpage>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Althoff</surname> <given-names>R. R.</given-names></name> <name><surname>Copeland</surname> <given-names>W. E.</given-names></name> <name><surname>Stanger</surname> <given-names>C.</given-names></name> <name><surname>Derks</surname> <given-names>E. M.</given-names></name> <name><surname>Todd</surname> <given-names>R. D.</given-names></name> <name><surname>Neuman</surname> <given-names>R. J.</given-names></name> <etal/></person-group>. (<year>2006</year>). <article-title>The latent class structure of ADHD is stable across informants</article-title>. <source>Twin Res. Hum. Genet</source>. <volume>9</volume>, <fpage>507</fpage>&#x02013;<lpage>522</lpage>. doi: <pub-id pub-id-type="doi">10.1375/twin.9.4.507</pub-id><pub-id pub-id-type="pmid">16899158</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Angela</surname> <given-names>U.</given-names></name> <name><surname>Alex</surname> <given-names>J. B.</given-names></name></person-group> (<year>2014</year>). <article-title>What are the different types of principals across the United States? A latent class analysis of principal perception of leadership</article-title>. <source>Educ. Admin. Quart</source>. <volume>50</volume>, <fpage>96</fpage>&#x02013;<lpage>134</lpage>. doi: <pub-id pub-id-type="doi">10.1177/0013161X13489019</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Araka</surname> <given-names>E.</given-names></name> <name><surname>Oboko</surname> <given-names>R.</given-names></name> <name><surname>Maina</surname> <given-names>E.</given-names></name> <name><surname>Gitonga</surname> <given-names>R.</given-names></name></person-group> (<year>2022</year>). <article-title>Using educational data mining techniques to identify profiles in self-regulated learning: an empirical evaluation</article-title>. <source>Int. Rev. Res. Open Distr. Learn</source>. <volume>23</volume>, <fpage>131</fpage>&#x02013;<lpage>162</lpage>. doi: <pub-id pub-id-type="doi">10.19173/irrodl.v22i4.5401</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Azzalini</surname> <given-names>A.</given-names></name> <name><surname>Menardi</surname> <given-names>G.</given-names></name></person-group> (<year>2014</year>). <article-title>Clustering via nonparametric density estimation: The r package pdfcluster</article-title>. <source>J. Stat. Softw</source>. <volume>57</volume>, <fpage>1</fpage>&#x02013;<lpage>26</lpage>. doi: <pub-id pub-id-type="doi">10.18637/jss.v057.i11</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barnard-Brak</surname> <given-names>L.</given-names></name> <name><surname>Paton</surname> <given-names>V. O.</given-names></name> <name><surname>Lan</surname> <given-names>W. Y.</given-names></name></person-group> (<year>2010</year>). <article-title>Profiles in self-regulated learning in the online learning environment</article-title>. <source>Int. Rev. Res. Open Distr. Learn</source>. <volume>11</volume>, <fpage>61</fpage>&#x02013;<lpage>80</lpage>. doi: <pub-id pub-id-type="doi">10.19173/irrodl.v11i1.769</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Basten</surname> <given-names>M. M.</given-names></name> <name><surname>Althoff</surname> <given-names>R. R.</given-names></name> <name><surname>Tiemeier</surname> <given-names>H.</given-names></name> <name><surname>Jaddoe</surname> <given-names>V. W.</given-names></name> <name><surname>Hofman</surname> <given-names>A.</given-names></name> <name><surname>Hudziak</surname> <given-names>J. J.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>The dysregulation profile in young children: empirically defined classes in the Generation R study</article-title>. <source>J. Am. Acad. Child Adoles. Psychiat</source>. <volume>52</volume>, <fpage>841</fpage>&#x02013;<lpage>850</lpage>.e2. doi: <pub-id pub-id-type="doi">10.1016/j.jaac.2013.05.007</pub-id><pub-id pub-id-type="pmid">23880494</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bishop</surname> <given-names>C. M.</given-names></name> <name><surname>Nasrabadi</surname> <given-names>N. M.</given-names></name></person-group> (<year>2006</year>). <source>Pattern Recognition and Machine Learning</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Broadbent</surname> <given-names>J.</given-names></name> <name><surname>Fuller-Tyszkiewicz</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Profiles in self-regulated learning and their correlates for online and blended learning students</article-title>. <source>Educ. Technol. Res. Dev</source>. <volume>66</volume>, <fpage>1435</fpage>&#x02013;<lpage>1455</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11423-018-9595-9</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brusco</surname> <given-names>M. J.</given-names></name></person-group> (<year>2004</year>). <article-title>Clustering binary data in the presence of masking variables</article-title>. <source>Psychol. Methods</source> <volume>9</volume>, <fpage>510</fpage>&#x02013;<lpage>523</lpage>. doi: <pub-id pub-id-type="doi">10.1037/1082-989X.9.4.510</pub-id><pub-id pub-id-type="pmid">15598102</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brusco</surname> <given-names>M. J.</given-names></name> <name><surname>Cradit</surname> <given-names>J. D.</given-names></name></person-group> (<year>2001</year>). <article-title>A variable-selection heuristic for k-means clustering</article-title>. <source>Psychometrika</source> <volume>66</volume>, <fpage>249</fpage>&#x02013;<lpage>270</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF02294838</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brusco</surname> <given-names>M. J.</given-names></name> <name><surname>Shireman</surname> <given-names>E.</given-names></name> <name><surname>Steinley</surname> <given-names>D.</given-names></name></person-group> (<year>2017</year>). <article-title>A comparison of latent class, <italic>k</italic>-means, and <italic>k</italic>-median methods for clustering dichotomous data</article-title>. <source>Psychol. Methods</source> <volume>22</volume>, <fpage>563</fpage>&#x02013;<lpage>580</lpage>. doi: <pub-id pub-id-type="doi">10.1037/met0000095</pub-id><pub-id pub-id-type="pmid">27607543</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cali&#x00144;ski</surname> <given-names>T.</given-names></name> <name><surname>Harabasz</surname> <given-names>J.</given-names></name></person-group> (<year>1974</year>). <article-title>A dendrite method for cluster analysis</article-title>. <source>Commun. Statist</source>. <volume>3</volume>, <fpage>1</fpage>&#x02013;<lpage>27</lpage>. doi: <pub-id pub-id-type="doi">10.1080/03610917408548446</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Carmone</surname> <given-names>F. J. J.</given-names></name> <name><surname>Kara</surname> <given-names>A.</given-names></name> <name><surname>Maxwell</surname> <given-names>S.</given-names></name></person-group> (<year>1999</year>). <article-title>Hinov: a new model to improve market segment definition by identifying noisy variables</article-title>. <source>J. Market. Res</source>. <volume>36</volume>, <fpage>501</fpage>&#x02013;<lpage>509</lpage>. doi: <pub-id pub-id-type="doi">10.1177/002224379903600408</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dimitriadou</surname> <given-names>E.</given-names></name> <name><surname>Dolni&#x0010D;ar</surname> <given-names>S.</given-names></name> <name><surname>Weingessel</surname> <given-names>A.</given-names></name></person-group> (<year>2002</year>). <article-title>An examination of indexes for determining the number of clusters in binary data sets</article-title>. <source>Psychometrika</source> <volume>67</volume>, <fpage>137</fpage>&#x02013;<lpage>159</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF02294713</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Drew</surname> <given-names>A. L.</given-names></name> <name><surname>Jeffrey</surname> <given-names>B. L.</given-names></name></person-group> (<year>2011</year>). <article-title>poLCA: an R package for polytomous variable latent class analysis</article-title>. <source>J. Stat. Softw</source>. <volume>42</volume>, <fpage>1</fpage>&#x02013;<lpage>29</lpage>. doi: <pub-id pub-id-type="doi">10.18637/jss.v042.i10</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fowlkes</surname> <given-names>E. B.</given-names></name> <name><surname>Mallows</surname> <given-names>C. L.</given-names></name></person-group> (<year>1983</year>). <article-title>A method for comparing two hierarchical clusterings</article-title>. <source>J. Am. Stat. Assoc</source>. <volume>78</volume>, <fpage>553</fpage>&#x02013;<lpage>569</lpage>. doi: <pub-id pub-id-type="doi">10.1080/01621459.1983.10478008</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Godichon-Baggioni</surname> <given-names>A.</given-names></name></person-group> (<year>2022</year>). <article-title>Kmedians: K-Medians</article-title>. <source>R package version 2.2.0</source>. doi: <pub-id pub-id-type="doi">10.32614/CRAN.package.Kmedians</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goodman</surname> <given-names>L. A.</given-names></name></person-group> (<year>1974</year>). <article-title>Exploratory latent structure analysis using both identifiable and unidentifiable models</article-title>. <source>Biometrika</source> <volume>61</volume>, <fpage>215</fpage>&#x02013;<lpage>231</lpage>. doi: <pub-id pub-id-type="doi">10.1093/biomet/61.2.215</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Haslbeck</surname> <given-names>J. M.</given-names></name> <name><surname>Vermunt</surname> <given-names>J. K.</given-names></name> <name><surname>Waldorp</surname> <given-names>L. J.</given-names></name></person-group> (<year>2023</year>). <article-title>The impact of ordinal scales on Gaussian mixture recovery</article-title>. <source>Behav. Res. Methods.</source> <volume>55</volume>, <fpage>2143</fpage>&#x02013;<lpage>2156</lpage>. doi: <pub-id pub-id-type="doi">10.3758/s13428-022-01883-8</pub-id><pub-id pub-id-type="pmid">35831565</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hubert</surname> <given-names>L.</given-names></name></person-group> (<year>1974</year>). <article-title>Problems of seriation using a subject by item response matrix</article-title>. <source>Psychol. Bull</source>. <volume>81</volume>, <fpage>976</fpage>&#x02013;<lpage>983</lpage>. doi: <pub-id pub-id-type="doi">10.1037/h0037348</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Johnson</surname> <given-names>S. C.</given-names></name></person-group> (<year>1967</year>). <article-title>Hierarchical clustering schemes</article-title>. <source>Psychometrika</source> <volume>32</volume>, <fpage>241</fpage>&#x02013;<lpage>254</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF02289588</pub-id><pub-id pub-id-type="pmid">5234703</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kaufman</surname> <given-names>L.</given-names></name> <name><surname>Rousseeuw</surname> <given-names>P. J.</given-names></name></person-group> (<year>1990</year>). <source>Finding Groups in Data: An Introduction to Cluster Analysis</source>. New York: John Wiley &#x00026; Sons. doi: <pub-id pub-id-type="doi">10.1002/9780470316801</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Klingler</surname> <given-names>S.</given-names></name> <name><surname>Wampfler</surname> <given-names>R.</given-names></name> <name><surname>K&#x000E4;ser</surname> <given-names>T.</given-names></name> <name><surname>Solenthaler</surname> <given-names>B.</given-names></name> <name><surname>Gross</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Efficient feature embeddings for student classification with variational auto-encoders,&#x0201D;</article-title> in <source>International Educational Data Mining Society, Paper presented at the International Conference on Educational Data Mining (EDM)</source>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Lazarsfeld</surname> <given-names>P. F.</given-names></name></person-group> (<year>1950a</year>). <article-title>&#x0201C;The interpretation and mathematical foundation of latent structure analysis,&#x0201D;</article-title> in <source>Measurement and Prediction</source> (<publisher-loc>Princeton University Press</publisher-loc>), <fpage>413</fpage>&#x02013;<lpage>472</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Lazarsfeld</surname> <given-names>P. F.</given-names></name></person-group> (<year>1950b</year>). <article-title>&#x0201C;The logical and mathematical foundation of latent structure analysis,&#x0201D;</article-title> in <source>Measurement and Prediction</source> (<publisher-loc>Princeton University Press</publisher-loc>), <fpage>362</fpage>&#x02013;<lpage>412</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>MacQueen</surname> <given-names>J.</given-names></name></person-group> (<year>1967</year>). <article-title>&#x0201C;Some methods for classification and analysis of multivariate observations,&#x0201D;</article-title> in <source>Proceedings of 5-th Berkeley Symposium on Mathematical Statistics and Probability</source> (<publisher-loc>University of California Press</publisher-loc>), <fpage>281</fpage>&#x02013;<lpage>297</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Maechler</surname> <given-names>M.</given-names></name> <name><surname>Rousseeuw</surname> <given-names>P.</given-names></name> <name><surname>Struyf</surname> <given-names>A.</given-names></name> <name><surname>Hubert</surname> <given-names>M.</given-names></name> <name><surname>Hornik</surname> <given-names>K.</given-names></name></person-group> (<year>2013</year>). <source>cluster: Cluster Analysis Basics and Extensions</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://svn.r-project.org/R-packages/trunk/cluster/">https://svn.r-project.org/R-packages/trunk/cluster/</ext-link></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Magidson</surname> <given-names>J.</given-names></name> <name><surname>Vermunt</surname> <given-names>J.</given-names></name></person-group> (<year>2002</year>). <article-title>Latent class models for clustering: a comparison with k-means</article-title>. <source>Canad. J. Market. Res</source>. <volume>20</volume>, <fpage>36</fpage>&#x02013;<lpage>43</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>McCutcheon</surname> <given-names>A. L.</given-names></name></person-group> (<year>1987</year>). <source>Latent Class Analysis</source>. London: Sage. doi: <pub-id pub-id-type="doi">10.4135/9781412984713</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>McLachlan</surname> <given-names>G.</given-names></name> <name><surname>Peel</surname> <given-names>D.</given-names></name></person-group> (<year>2000</year>). <source>Finite Mixture Models</source>. London: Wiley-Interscience Publication. doi: <pub-id pub-id-type="doi">10.1002/0471721182</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ng</surname> <given-names>A. Y.</given-names></name> <name><surname>Jordan</surname> <given-names>M. I.</given-names></name> <name><surname>Weiss</surname> <given-names>Y.</given-names></name></person-group> (<year>2002</year>). <article-title>&#x0201C;On spectral clustering: analysis and an algorithm,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <fpage>849</fpage>&#x02013;<lpage>856</lpage>.</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nozadi</surname> <given-names>S. S.</given-names></name> <name><surname>Troller-Renfree</surname> <given-names>S.</given-names></name> <name><surname>White</surname> <given-names>L. K.</given-names></name> <name><surname>Frenkel</surname> <given-names>T.</given-names></name> <name><surname>Degnan</surname> <given-names>K. A.</given-names></name> <name><surname>Bar-Haim</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>The moderating role of attention biases in understanding the link between behavioral inhibition and anxiety</article-title>. <source>J. Exp. Psychopathol</source>. <volume>7</volume>, <fpage>451</fpage>&#x02013;<lpage>465</lpage>. doi: <pub-id pub-id-type="doi">10.5127/jep.052515</pub-id><pub-id pub-id-type="pmid">30498566</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Papachristou</surname> <given-names>N.</given-names></name> <name><surname>Miaskowski</surname> <given-names>C.</given-names></name> <name><surname>Barnaghi</surname> <given-names>P.</given-names></name> <name><surname>Maguire</surname> <given-names>R.</given-names></name> <name><surname>Farajidavar</surname> <given-names>N.</given-names></name> <name><surname>Cooper</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>&#x0201C;Comparing machine learning clustering with latent class analysis on cancer symptoms&#x00027; data,&#x0201D;</article-title> in <source>2016 IEEE Healthcare Innovation Point-Of-Care Technologies Conference (HI-POCT)</source>, 162&#x02013;166. doi: <pub-id pub-id-type="doi">10.1109/HIC.2016.7797722</pub-id></mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book"><person-group person-group-type="author"><collab>R Core Team</collab></person-group> (<year>2023</year>). <source>R: A Language and Environment for Statistical Computing</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>R Foundation for Statistical Computing</publisher-name>.</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rohe</surname> <given-names>K.</given-names></name> <name><surname>Chatterjee</surname> <given-names>S.</given-names></name> <name><surname>Yu</surname> <given-names>B.</given-names></name></person-group> (<year>2011</year>). <article-title>Spectral clustering and the high-dimensional stochastic blockmodel</article-title>. <source>Ann. Statist</source>. <volume>39</volume>, <fpage>1878</fpage>&#x02013;<lpage>1915</lpage>. doi: <pub-id pub-id-type="doi">10.1214/11-AOS887</pub-id></mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rousseeuw</surname> <given-names>P. J.</given-names></name></person-group> (<year>1987</year>). <article-title>Silhouettes: a graphical aid to the interpretation and validation of cluster analysis</article-title>. <source>J. Comput. Appl. Math</source>. <volume>20</volume>, <fpage>53</fpage>&#x02013;<lpage>65</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0377-0427(87)90125-7</pub-id></mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schreiber</surname> <given-names>J. B.</given-names></name> <name><surname>Pekarik</surname> <given-names>A. J.</given-names></name></person-group> (<year>2014</year>). <article-title>Using latent class analysis versus k-means or hierarchical clustering to understand museum visitors</article-title>. <source>Curator: The Museum Journal</source> <volume>57</volume>, <fpage>45</fpage>&#x02013;<lpage>59</lpage>. doi: <pub-id pub-id-type="doi">10.1111/cura.12050</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schwarz</surname> <given-names>G.</given-names></name></person-group> (<year>1978</year>). <article-title>Estimating the dimension of a model</article-title>. <source>Ann. Stat</source>. <volume>6</volume>, <fpage>461</fpage>&#x02013;<lpage>464</lpage>. doi: <pub-id pub-id-type="doi">10.1214/aos/1176344136</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sclove</surname> <given-names>S. L.</given-names></name></person-group> (<year>1987</year>). <article-title>Application of model-selection criteria to some problems in multivariate analysis</article-title>. <source>Psychometrika</source> <volume>52</volume>, <fpage>333</fpage>&#x02013;<lpage>343</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF02294360</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shireman</surname> <given-names>E. M.</given-names></name> <name><surname>Steinley</surname> <given-names>D.</given-names></name> <name><surname>Brusco</surname> <given-names>M. J.</given-names></name></person-group> (<year>2016</year>). <article-title>Local optima in mixture modeling</article-title>. <source>Multivariate Behav. Res</source>. <volume>51</volume>, <fpage>466</fpage>&#x02013;<lpage>481</lpage>. doi: <pub-id pub-id-type="doi">10.1080/00273171.2016.1160359</pub-id><pub-id pub-id-type="pmid">27494191</pub-id></mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><collab>SPARK Consortium</collab></person-group> (<year>2018</year>). <article-title>SPARK: a US cohort of 50,000 families to accelerate autism research</article-title>. <source>Neuron</source> <volume>97</volume>, <fpage>488</fpage>&#x02013;<lpage>493</lpage>.</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sparrow</surname> <given-names>S. S.</given-names></name> <name><surname>Cicchetti</surname> <given-names>D. V.</given-names></name> <name><surname>Saulnier</surname> <given-names>C. A.</given-names></name></person-group> (<year>2016</year>). <source>Vineland Adaptive Behavior Scales, Third Edition</source>. <publisher-loc>Pearson</publisher-loc>.</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Steinley</surname> <given-names>D.</given-names></name></person-group> (<year>2004</year>). <article-title>Properties of the Hubert-Arabie adjusted Rand index</article-title>. <source>Psychol. Methods</source> <volume>9</volume>, <fpage>386</fpage>&#x02013;<lpage>396</lpage>. doi: <pub-id pub-id-type="doi">10.1037/1082-989X.9.3.386</pub-id></mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tibshirani</surname> <given-names>R.</given-names></name> <name><surname>Walther</surname> <given-names>G.</given-names></name> <name><surname>Hastie</surname> <given-names>T.</given-names></name></person-group> (<year>2002</year>). <article-title>Estimating the number of clusters in a data set via the Gap statistic</article-title>. <source>J. R. Stat. Soc. Series B</source> <volume>63</volume>, <fpage>411</fpage>&#x02013;<lpage>423</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1467-9868.00293</pub-id></mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Vermunt</surname> <given-names>J. K.</given-names></name> <name><surname>Magidson</surname> <given-names>J.</given-names></name></person-group> (<year>2004</year>). <article-title>&#x0201C;Latent class analysis,&#x0201D;</article-title> in <source>The Sage Encyclopedia of Social Sciences Research Methods</source>, eds. M. S. Lewis-Beck, A. Bryman, and T. F. Liao (<publisher-loc>Thousand Oakes, CA</publisher-loc>: <publisher-name>Sage Publications</publisher-name>), <fpage>549</fpage>&#x02013;<lpage>553</lpage>.</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wadsworth</surname> <given-names>M. E.</given-names></name> <name><surname>Hudziak</surname> <given-names>J. J.</given-names></name> <name><surname>Heath</surname> <given-names>A. C.</given-names></name> <name><surname>Achenbach</surname> <given-names>T. M.</given-names></name></person-group> (<year>2001</year>). <article-title>Latent class analysis of child behavior checklist anxiety/depression in children and adolescents</article-title>. <source>J. Am. Acad. Child Adoles. Psychiat</source>. <volume>40</volume>, <fpage>106</fpage>&#x02013;<lpage>114</lpage>. doi: <pub-id pub-id-type="doi">10.1097/00004583-200101000-00023</pub-id><pub-id pub-id-type="pmid">11195551</pub-id></mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Du</surname> <given-names>X.</given-names></name> <name><surname>Rice</surname> <given-names>K.</given-names></name> <name><surname>Hung</surname> <given-names>J.-L.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Revealing at-risk learning patterns and corresponding self-regulated strategies via LSTM encoder and time-series clustering</article-title>. <source>Inf. Disc. Deliv</source>. <volume>50</volume>, <fpage>206</fpage>&#x02013;<lpage>216</lpage>. doi: <pub-id pub-id-type="doi">10.1108/IDD-12-2020-0160</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/400257/overview">Kuan-Yu Jin</ext-link>, Hong Kong Examinations and Assessment Authority, Hong Kong SAR, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1481319/overview">Max Hahn-Klimroth</ext-link>, Goethe University Frankfurt, Germany</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2392023/overview">Rajesh Kumar</ext-link>, Kansas State University Olathe, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3190402/overview">Yi-Jhen Wu</ext-link>, TU Dortmund University, Germany</p>
</fn>
</fn-group>
</back>
</article>
