<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Psychology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-1078</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyg.2026.1654996</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Latent trait or sum score: addressing measurement challenges in the prediction of self-rated symptom outcomes in psychological treatment</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Hentati Isacsson</surname>
<given-names>Nils</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3094891"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Johansson</surname>
<given-names>Magnus</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kaldo</surname>
<given-names>Viktor</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1116602"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Clinical Neuroscience, Stockholm Health Care Services, Centre for Psychiatry Research, Karolinska Institutet</institution>, <city>Stockholm</city>, <country country="se">Sweden</country></aff>
<aff id="aff2"><label>2</label><institution>Division Built Environment, RISE Research Institutes of Sweden, System Transition</institution>, <city>Gothenburg</city>, <country country="se">Sweden</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Psychology, Faculty of Health and Life Sciences, Linnaeus University</institution>, <city>V&#x00E4;xj&#x00F6;</city>, <country country="se">Sweden</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Nils Hentati Isacsson, <email xlink:href="mailto:nils.isacsson@ki.se">nils.isacsson@ki.se</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-26">
<day>26</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1654996</elocation-id>
<history>
<date date-type="received">
<day>27</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>03</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Hentati Isacsson, Johansson and Kaldo.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Hentati Isacsson, Johansson and Kaldo</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-26">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Objective</title>
<p>Reliable and accurate measurement is fundamental to scientific progress; however, the dominant measurement practices in psychology, clinical psychology, and prediction research often lack rigor. Improving measures using Rasch Measurement Theory (RMT) offers advantages by fulfilling the key psychometric properties of unidimensionality, local independence of items, ordering of response categories, and invariance. Ordinal-level sum scores can be transformed into interval-level latent trait scores, thereby improving the measurement precision. However, the impact of using psychometrically advanced questionnaires with latent trait scores, as opposed to traditional sum scores, in predictive models is still unclear. This study evaluates whether using latent trait scores as predictors and outcomes, in accordance with RMT, improves predictive performance compared to using traditional sum scores when predicting treatment outcomes during psychological treatment.</p>
</sec>
<sec>
<title>Methods</title>
<p>Self-rated symptom data from three different questionnaires, collected over the first 4&#x202F;weeks of psychological treatment from 6,464 patients undergoing a 12-week treatment program, were used to predict post-treatment outcomes on the same questionnaires. This was done in two ways: (1) using sum scores as the questionnaires were originally developed and (2) using a reformulated, more psychometrically robust version of the questionnaires based on Rasch analysis, which was also shorter. The prediction models used were linear regression, Bayesian ridge regression, and random forest. Multiple imputations were used to address missing data, and nested cross-validation was employed for hyperparameter tuning and scoring.</p>
</sec>
<sec>
<title>Results</title>
<p>Latent scores calculated using the psychometrically optimized shorter version, which comprises 23% of the full scale, showed similar predictive performance compared to the sum score of the full scale. Overall, there was a statistically significant but practically negligible difference of 0.007&#x2013;0.008 in the root mean squared error (RMSE) when comparing the original sum score to the latent trait scores.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>Initial findings comparing psychometrically improved questionnaires with the original ordinal sum scores within a predictive framework indicate that latent trait scores derived from these improvements showed predictive performance similar to that of the sum score of the full scale. The small differences suggest that the improved versions remain valuable owing to their enhanced psychometric qualities and the reduction in response burden by using considerably fewer items. Further research is needed to explore the use of latent trait scores compared to ordinal sum scores in predictive research.</p>
</sec>
</abstract>
<kwd-group>
<kwd>digital mental health</kwd>
<kwd>ICBT</kwd>
<kwd>latent trait</kwd>
<kwd>machine learning</kwd>
<kwd>prediction</kwd>
<kwd>Rasch Measurement</kwd>
<kwd>treatment outcome</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was mainly supported by The Swedish Research Council (VR), The Erling Persson family foundation (EP-Stiftelsen), and The Swedish ALF-agreement between the Swedish government and the county councils, with additional funding by the Swedish Foundation for Strategic Research (SSF).</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="1"/>
<equation-count count="1"/>
<ref-count count="55"/>
<page-count count="9"/>
<word-count count="7820"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Quantitative Psychology and Measurement</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<title>Introduction</title>
<p>Reliable and accurate measurement is the cornerstone of scientific progress. The ability to define and, with validity, quantify phenomena consistently underpins the development of theories, the testing of hypotheses (<xref ref-type="bibr" rid="ref38">Michell, 1997</xref>), and the application of findings to real-world challenges (<xref ref-type="bibr" rid="ref41">Pendrill, 2018</xref>). Although not widely discussed in the field of psychological research, there has been a long-standing critique of the dominant measurement practices (<xref ref-type="bibr" rid="ref38">Michell, 1997</xref>) with increased attention in recent years (<xref ref-type="bibr" rid="ref11">Elson et al., 2023</xref>; <xref ref-type="bibr" rid="ref14">Flake and Fried, 2020</xref>; <xref ref-type="bibr" rid="ref27">Johansson et al., 2023</xref>).</p>
<p>Current practices often ignore issues regarding measurement (<xref ref-type="bibr" rid="ref13">Flake et al., 2022</xref>), as exemplified by <xref ref-type="bibr" rid="ref30">Lilienfeld and Strother (2020)</xref>, seldom motivate the validity of instruments, and often rely on psychometric evaluations based on small samples (<xref ref-type="bibr" rid="ref11">Elson et al., 2023</xref>). Furthermore, psychometric evaluations often rely on sum scores (<xref ref-type="bibr" rid="ref37">McNeish and Wolf, 2020b</xref>) using Cronbach&#x2019;s alpha (<xref ref-type="bibr" rid="ref33">McNeish, 2018</xref>) to assess scale properties, an approach that has faced substantial methodological critique (<xref ref-type="bibr" rid="ref34">McNeish, 2022</xref>). There is no widespread consensus on how to assess the psychometric quality; however, four key psychometric properties have been suggested as a minimal framework for guiding psychometric analyses: unidimensionality, local independence of items, ordering of response categories, and invariance (<xref ref-type="bibr" rid="ref7">Christensen et al., 2013</xref>; <xref ref-type="bibr" rid="ref27">Johansson et al., 2023</xref>; <xref ref-type="bibr" rid="ref28">Kreiner, 2007</xref>).</p>
<p>Although these properties can be evaluated through various methods, Rasch Measurement Theory (RMT) offers the distinct advantage of treating the ordinal sum score as a sufficient statistic for measurement (<xref ref-type="bibr" rid="ref1">Andrich and Marais, 2019</xref>). Rasch analysis models the probability of a response to an item based on a person&#x2019;s <italic>ability</italic> or <italic>trait (&#x03B2;)</italic> and an item&#x2019;s <italic>difficulty</italic> (<inline-formula>
<mml:math id="M1">
<mml:mi>&#x03B4;</mml:mi>
</mml:math>
</inline-formula>), and the formulation of the model suggests that these two parameters can be separated, known as &#x201C;specific objectivity.&#x201D; <xref ref-type="fig" rid="fig1">Figure 1</xref> shows the modeled probability of a correct response to a dichotomous item (<xref ref-type="disp-formula" rid="E1">Equation 1</xref>) with difficulty <inline-formula>
<mml:math id="M2">
<mml:mi>&#x03B4;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:math>
</inline-formula>, depending on the varying level of a person&#x2019;s latent trait <inline-formula>
<mml:math id="M3">
<mml:mi>&#x03B2;</mml:mi>
</mml:math>
</inline-formula>:</p>
<disp-formula id="E1">
<mml:math id="M4">
<mml:mi>P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext mathvariant="italic">Correct response</mml:mtext>
<mml:mspace width="0.1em"/>
<mml:mo>&#x2223;</mml:mo>
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x03B4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03B4;</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03B4;</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(1)</label>
</disp-formula>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Probability of a correct response to a dichotomous item for persons of varying proficiency. The latent trait <inline-formula>
<mml:math id="M5">
<mml:mi>&#x03B2;</mml:mi>
</mml:math>
</inline-formula> represents the latent ability of a person and varies between &#x2212;5 and +5 on the logit scale in this figure. <inline-formula>
<mml:math id="M6">
<mml:mi>&#x03B4;</mml:mi>
</mml:math>
</inline-formula> is the item difficulty, set to 0 in this figure.</p>
</caption>
<graphic xlink:href="fpsyg-17-1654996-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Logistic curve graph depicting the probability of correct response against latent trait, with the inflection point at zero labeled as beta equals delta and regions marked beta less than delta and beta greater than delta.</alt-text>
</graphic>
</fig>
<p><xref ref-type="disp-formula" rid="E1">Equation 1</xref> is simplified in terms of notation and subscripts, as shown by <xref ref-type="bibr" rid="ref1">Andrich and Marais (2019)</xref>. <xref ref-type="fig" rid="fig1">Figure 1</xref> illustrates that as a person&#x2019;s trait increases, the probability of a correct response also increases. When the item difficulty (set to 0 in the figure) equals a person&#x2019;s latent score <inline-formula>
<mml:math id="M7">
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x03B4;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula>, the probability of a correct answer is 50% (see dashed lines in <xref ref-type="fig" rid="fig1">Figure 1</xref>).</p>
<p>As a person&#x2019;s latent trait increases further from the item difficulty, <inline-formula>
<mml:math id="M8">
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>&#x003E;</mml:mo>
<mml:mi>&#x03B4;</mml:mi>
</mml:math>
</inline-formula>, the probability of a correct answer increases toward 100%, and vice versa. Latent traits and item difficulties are both expressed in logit units (log-odds) with an arbitrary center or reference point (such as 0). In clinical psychology, a typical situation exemplified by <xref ref-type="fig" rid="fig1">Figure 1</xref> could be a dichotomous item that indicates the presence of suicidal ideation. This question could be answered as yes or no with a numerical representation of 1 or 0, respectively. Such an item could be included in a questionnaire to measure the trait of depression; as this inferred trait increases (a more depressed patient), the probability of endorsing &#x201C;yes&#x201D; to suicidal ideation also increases. In short, the difficulty or <inline-formula>
<mml:math id="M9">
<mml:mi>&#x03B4;</mml:mi>
</mml:math>
</inline-formula> of such an item would indicate that those with more of the latent trait of depression would indicate the presence (answer yes) of suicidal ideation. Specific objectivity essentially means that differences between items&#x2019; difficulties (<inline-formula>
<mml:math id="M10">
<mml:mi>&#x03B4;</mml:mi>
</mml:math>
</inline-formula>) can be assessed independently of the current sample of respondents providing answers. Similarly, differences between persons (<inline-formula>
<mml:math id="M11">
<mml:mi>&#x03B2;</mml:mi>
</mml:math>
</inline-formula>) can be assessed independently of the difficulties of the items. When a set of items fulfills the criteria previously listed, Rasch analysis allows the transformation of an ordinal raw sum score into an interval-level latent trait score (latent score) for each person, with specific measurement error at each level of the scale. This represents an individual&#x2019;s latent trait regarding what is being measured. In this article, a trait or latent score is the inferred <italic>amount</italic> of what is being measured, not an inherent trait.</p>
<p>In prediction research using self-rated symptom measures, as in psychological research, limited attention has been paid to measurement practices. Clinical prediction models for psychological treatment have the potential to improve treatment outcomes (<xref ref-type="bibr" rid="ref2">Barkham et al., 2023</xref>; <xref ref-type="bibr" rid="ref15">Forsell et al., 2019</xref>; <xref ref-type="bibr" rid="ref20">Hentati Isacsson et al., 2024a</xref>). Therefore, improving these measures could have a significant impact on patients. There are several experiments investigating the impact of measurement error in prediction research involving self-rated symptoms. <xref ref-type="bibr" rid="ref24">Jacobucci and Grimm (2020)</xref> showed using simulations that a predictor&#x2019;s reliability can heavily influence the prediction performance (<xref ref-type="bibr" rid="ref24">Jacobucci and Grimm, 2020</xref>), and medical prediction models have been shown to be less valid as a function of increased measurement error in predictors (<xref ref-type="bibr" rid="ref31">Luijken et al., 2019</xref>). <xref ref-type="bibr" rid="ref32">McNamara et al. (2022)</xref> took this one step further and simulated both measurement error in predictors and varying degrees of outcome misspecification (<xref ref-type="bibr" rid="ref32">McNamara et al., 2022</xref>). The result was that the underlying non-linear relationship was not identified by non-linear models (e.g., random forest) or regression models, and non-linear models had indistinguishable performance. Furthermore, an attempt to attenuate measurement problems by increasing the sample size from 4,000 to 100,000 yielded negligible improvements in predictive performance. 
Thus, when the measurement error is high, the predictive performance plateaus at a low level compared to models using more reliable variables (<xref ref-type="bibr" rid="ref32">McNamara et al., 2022</xref>), indicating a heavy influence of reliability for both predictors and outcomes. Moreover, this does not consider the validity of measurement. As such, it is perhaps surprising that the recently updated TRIPOD+AI statement for research on clinical prediction models does not mention measurement practices in relation to predictors or outcomes, except that it should be noted &#x201C;&#x2026;how and when they (predictors/outcomes) were measured&#x2026;&#x201D; (<xref ref-type="bibr" rid="ref9">Collins et al., 2024</xref>). This is especially surprising in clinical psychology, where measurement error is an issue. Although using psychometrically sound measures is important and measurement reliability significantly affects predictions, there is limited empirical evidence that psychometrically refined measures offer predictive advantages over summed ordinal scores based on a set of items that have not been subject to an adequate psychometric assessment. However, a few studies indicate that a measure validated using RMT can outperform a traditional measure in ROC/AUC classification performance, although these were for pregnancy tests and intensive care unit admissions (<xref ref-type="bibr" rid="ref12">Fisher and Burton, 2010</xref>; <xref ref-type="bibr" rid="ref42">Pendrill et al., 2023</xref>).</p>
<p>Recently, there has been an encompassing debate about the use of sum scores in psychological research (<xref ref-type="bibr" rid="ref35">McNeish, 2024</xref>; <xref ref-type="bibr" rid="ref37">McNeish and Wolf, 2020b</xref>; <xref ref-type="bibr" rid="ref47">Sijtsma et al., 2024</xref>; <xref ref-type="bibr" rid="ref54">Widaman and Revelle, 2023</xref>). In a simulation setting, for certain conditions, the sum score can correlate to a stronger degree with the true underlying latent score than the estimated latent score itself (<xref ref-type="bibr" rid="ref47">Sijtsma et al., 2024</xref>). Furthermore, <xref ref-type="bibr" rid="ref47">Sijtsma et al. (2024)</xref> showed using a simulation that the sum score can adequately represent the score from each item (similar to an estimated latent score). Thus, while the sum score can perform adequately both in a predictive context and for inference (<xref ref-type="bibr" rid="ref47">Sijtsma et al., 2024</xref>), its current use is seldom motivated properly (<xref ref-type="bibr" rid="ref35">McNeish, 2024</xref>) in relation to psychometric properties such as dimensionality, validity, and invariance assessments. Therefore, the naive use of the sum score carries a high risk of bias. However, when the sum score shows stochastic ordering, the latent score and sum score can have similar performances in predicting an external variable (<xref ref-type="bibr" rid="ref47">Sijtsma et al., 2024</xref>). Stochastic ordering means that, as the sum score increases, the latent variable also increases when conditioned on the sum score. While sum scores have pragmatic uses, a more psychometrically sound latent score is expected to improve predictive performance based on previous simulation studies. 
At the same time, empirical research on measuring change in psychometrically evaluated standardized tests of preschool children shows that the latent and ordinal sum scores show marginal differences (<xref ref-type="bibr" rid="ref4">Bezruczko et al., 2016</xref>). Thus, the overall advantage of using latent scores over sum scores in an empirical predictive framework remains largely unexplored.</p>
<sec id="sec2">
<title>Objectives</title>
<p>The aim of this study is to evaluate and compare predictive models using traditional sum scores and latent scores based on a reformulated, more psychometrically sound version of the questionnaires. Specifically, we investigated whether using the latent scores as predictors and outcomes with these reformulated questionnaires increases the predictive performance of the models predicting treatment outcomes in psychological treatment.</p>
</sec>
</sec>
<sec sec-type="methods" id="sec3">
<title>Methods</title>
<p>This is a prospective prediction study using longitudinal observational data from a regular care clinic providing therapist-guided psychological treatment. Ethical approval was received from the regional ethical review board in Stockholm (Dnr: 2011/2091&#x2013;31/3, amendment 2016/21&#x2013;32, 2017/2320&#x2013;32, and 2018/2550&#x2013;32). The supplement contains the results data, code for analysis, and further details of the methods.</p>
<sec id="sec4">
<title>Setting and participants</title>
<p>The participants (<italic>n</italic>&#x202F;=&#x202F;6,464) were routine care patients at an Internet psychiatry clinic in Stockholm (<xref ref-type="bibr" rid="ref49">Titov et al., 2018</xref>). They received 12&#x202F;weeks of Internet-delivered Cognitive Behavioral Therapy (ICBT) for either major depressive disorder (<italic>n</italic>&#x202F;=&#x202F;2,988), panic disorder (<italic>n</italic>&#x202F;=&#x202F;1,721), or social anxiety disorder (<italic>n</italic>&#x202F;=&#x202F;1,755). The treatments were guided by a licensed clinical psychologist and showed positive results (<xref ref-type="bibr" rid="ref10">El Alaoui et al., 2015</xref>; <xref ref-type="bibr" rid="ref19">Hedman et al., 2013</xref>, <xref ref-type="bibr" rid="ref18">2014</xref>). Each treatment consisted of condition-specific CBT techniques and weekly self-assessments of the primary symptoms. The data from all three treatments were pooled into a single dataset because this is beneficial for developing prediction models (<xref ref-type="bibr" rid="ref20">Hentati Isacsson et al., 2024a</xref>; <xref ref-type="bibr" rid="ref55">Zantvoort et al., 2024</xref>). The predicted outcome was the last self-assessment of the primary symptoms that occurred at treatment completion (post-treatment).</p>
</sec>
<sec id="sec5">
<title>Symptom data</title>
<p>The questionnaires used to assess the symptoms of each treatment were the Montgomery-&#x00C5;sberg Depression Rating Scale-Self Report (MADRS-S) (<xref ref-type="bibr" rid="ref39">Montgomery and Asberg, 1979</xref>) for major depressive disorder, the Panic Disorder Severity Scale-Self Report (PDSS-SR) (<xref ref-type="bibr" rid="ref23">Houck et al., 2002</xref>) for panic disorder, and the Liebowitz Social Anxiety Scale-Self Report version (LSAS-SR) (<xref ref-type="bibr" rid="ref16">Fresco et al., 2001</xref>) for social anxiety disorder. These assessments were conducted at screening, before the start of treatment; on a weekly basis during treatment; and post-treatment. The post-treatment time point was the predicted outcome. Furthermore, a min-max transformation based on the questionnaires&#x2019; minimum and maximum scores was applied to each intervention sample individually to enable the aggregation of all three treatments (<xref ref-type="bibr" rid="ref8">Cohen et al., 1999</xref>). The minimum ordinal sum score was 0 for all three questionnaires, and the maximum was 28, 54, and 144 for the PDSS-SR, MADRS-S, and LSAS-SR, respectively.</p>
</sec>
<sec id="sec6">
<title>Psychometric analyses</title>
<p>A prior analysis was conducted using a Rasch Measurement Theory (RMT) framework for all three questionnaires (MADRS-S, PDSS-SR, and LSAS-SR) separately (<xref ref-type="bibr" rid="ref21">Hentati Isacsson and Johansson, 2025</xref>). These analyses primarily used Rasch analysis to reformulate each questionnaire into a set of items with adequate measurement properties (<xref ref-type="bibr" rid="ref27">Johansson et al., 2023</xref>; <xref ref-type="bibr" rid="ref28">Kreiner, 2007</xref>). Consequently, all questionnaires were shortened. Measurement properties were assessed in an iterative analysis process and focused on scale unidimensionality, local independence of items, ordering of response categories, and invariance, which resulted in the elimination of several items from the original scales. The reformulated MADRS-S consists of three items (originally 9), the PDSS-SR of four items (originally 7), and the LSAS-SR of eight items (originally 48). Items were primarily removed owing to issues with either multidimensionality or local dependence. Thus, 23% of all the original questions were retained (see the <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref> for the items retained and <xref ref-type="bibr" rid="ref21">Hentati Isacsson and Johansson, 2025</xref> for item details). The reformulated scale and item parameters were used to estimate the transformation of raw ordinal sum scores to interval-level latent scores. The optimization process for estimating latent score used weighted likelihood (<xref ref-type="bibr" rid="ref53">Warm, 1989</xref>; see <xref ref-type="bibr" rid="ref21">Hentati Isacsson and Johansson, 2025</xref> for complete details). A Confirmatory Factor Analysis (CFA) is reported in the results to exemplify the improved psychometric properties of items in the reformulated &#x201C;Rasch&#x201D; versions of the original scales. The questionnaire data were obtained from the pretreatment assessment timepoint. 
This analysis proposed one unidimensional underlying factor implemented with lavaan (<xref ref-type="bibr" rid="ref45">Rosseel, 2012</xref>) using oblimin rotation and the Weighted Least Squares with Mean and Variance adjustment (WLSMV) estimator. To evaluate model fit, we used scaled fit metrics and dynamic cutoffs (<xref ref-type="bibr" rid="ref36">McNeish and Wolf, 2020a</xref>).</p>
</sec>
<sec id="sec7">
<title>Latent score</title>
<p>The latent scores used were based on the previous RMT reformulation of the questionnaires (<xref ref-type="bibr" rid="ref21">Hentati Isacsson and Johansson, 2025</xref>). In the predictions, the latent scores and their corresponding standard errors were used instead of the ordinal sum score from the original symptom data. As with the original symptom data, these latent scores and their standard errors were rescaled using a min-max transformation based on latent score tables, making the scale logits range from 0 to 1. Note that while the scores were rescaled, they were not standardized. See <xref ref-type="fig" rid="fig2">Figure 2</xref> for a scatterplot between the original scales&#x2019; sum score and the latent scores of the Rasch reformulated scales for post-treatment time points (outcome). The data are divided based on treatments due to the clinical sample, as each questionnaire is specific to the corresponding treatment.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Scatterplot of sum scores and latent scores for the post-treatment self-rating. Scatterplot shows sum scores on the y-axis from the original questionnaires and latent scores of reformulated questionnaires on the x-axis. Marginal y-axis shows the histogram over the original sum scores, and the marginal x-axis shows the histogram over the latent scores from the reformulated questionnaires. A smooth curve was fitted for each treatment. Questionnaires were: Montgomery-&#x00C5;sberg Depression Rating Scale Self-report for MDD, Panic Disorder Symptom Scale-Self Report for PD, and Liebowitz Social Anxiety Scale-Self Report for SAD. MDD, Major Depressive Disorder; PD, Panic Disorder; SAD, Social Anxiety Disorder. This analysis was based on complete data without imputation for the post-treatment assessment, which was the predicted outcome. Pearson&#x2019;s correlation was 0.92 for MDD, 0.95 for PD, and 0.90 for SAD.</p>
</caption>
<graphic xlink:href="fpsyg-17-1654996-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot with overlaid regression lines and marginal histograms compares latent scores and original ordinal sum scores for three treatments: MDD (dark blue), PD (orange), and SAD (magenta). SAD demonstrates the steepest positive relationship, while PD&#x2019;s is the flattest. Marginal histograms at the top and right show score distributions by treatment.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec8">
<title>Datasets</title>
<p>Two different datasets were created. The &#x201C;Base&#x201D; dataset used only the sum score of the original symptom scales for both prediction and outcome. The &#x201C;Rasch&#x201D; dataset used the latent score and its standard error of the reformulated symptom scale for prediction and outcome. However, the standard error of the outcome was not used, as this was tied to the outcome and would have introduced data leakage. For each dataset, the weekly symptom variables (sum or latent score) were added as separate predictors. For the Rasch datasets, the interaction between the latent score and the standard error of the latent score was also added. Furthermore, for the Rasch dataset, a weight was added only for use in the weighted regression or random forest; therefore, it was not included as a standalone predictor in the non-weighted models. This was based on the inverse sum of all the standard errors of the latent scores across the assessment times. This attempted to incorporate the estimation error for the latent variable available from the RMT analysis. Both &#x201C;Base&#x201D; and &#x201C;Rasch&#x201D; datasets used only the data from the assessment times up to and including week 4 of treatment to predict the post-treatment symptom score. As such, the pretreatment severity of symptoms was also included. The cut-off at week 4 was used, as it has previously been shown to be a good balance between clinical usefulness and predictive value (<xref ref-type="bibr" rid="ref15">Forsell et al., 2019</xref>; <xref ref-type="bibr" rid="ref20">Hentati Isacsson et al., 2024a</xref>). Both datasets also contained indicator variables for treatment, variables about sex and age, and the year of treatment start. These minimal clinical variables have previously been found to be useful in a predictive framework. Further statistical details can be found in the study by <xref ref-type="bibr" rid="ref20">Hentati Isacsson et al. (2024a)</xref>. 
Furthermore, these variables were included to counteract possible confounding in the data due to the different treatments, drift in clinical expertise over the years, sex, and age.</p>
</sec>
<sec id="sec9">
<title>Prediction models</title>
<p>The following models were used for analyses: A dummy regressor (DR) only predicting the mean of the outcome, Linear regression (LR), Bayesian ridge regression (BR), and a Random Forest (RF) model. No longitudinal model was used because previous findings suggest that these models do not improve predictive capability compared to their non-longitudinal counterparts (<xref ref-type="bibr" rid="ref22">Hentati Isacsson et al., 2024b</xref>). These models were chosen based on <xref ref-type="bibr" rid="ref20">Hentati Isacsson et al. (2024a)</xref> and <xref ref-type="bibr" rid="ref22">Hentati Isacsson et al. (2024b)</xref>, who found no differential impact of predictive models on this prediction problem, and ergo these models were all found on par with other, more computationally complex models. Furthermore, because the main objective of this study was not to determine the differential impact of predictive models on this problem, we deemed a limited number of models to suffice. All models were implemented in Python 3.10.12 (<xref ref-type="bibr" rid="ref43">Python Software Foundation, 2023</xref>) using scikit-learn (<xref ref-type="bibr" rid="ref40">Pedregosa et al., 2011</xref>).</p>
<p>The DR predicts only the mean of the outcome and represents a model that is not trained at all (a null model). LR is considered the benchmark method representing the predictive capabilities of a simple model. Bayesian Ridge Regression is a more complex model that incorporates uncertainty and regularization (<xref ref-type="bibr" rid="ref48">Tipping, 2001</xref>). Finally, Random Forest is a machine learning model that combines multiple decision trees to improve predictive performance, reduce overfitting, and capture non-linear relationships (<xref ref-type="bibr" rid="ref5">Breiman, 2001</xref>).</p>
</sec>
<sec id="sec10">
<title>Hyperparameters</title>
<p>We tuned the following hyperparameters using a grid search inside the nested cross-validation loop. The BR models considered alpha_1, alpha_2, lambda_1, and lambda_2 values of [1e-6, 1e-4, 1e-2]. The RF models considered all variables with 100 or 300 estimators, a minimum sample split size of [2, 5, 10], and a minimum sample leaf size of [1, 2, 5, 10].</p>
</sec>
<sec id="sec11">
<title>Imputation</title>
<p>We imputed the missing data before cross-validation (<xref ref-type="bibr" rid="ref25">Jaeger et al., 2020</xref>). Imputation was carried out in accordance with a multilevel imputation approach (<xref ref-type="bibr" rid="ref17">Grund et al., 2018</xref>) with 20 imputations for each type of dataset using MICE implemented in R (<xref ref-type="bibr" rid="ref51">van Buuren and Groothuis-Oudshoorn, 2011</xref>; <xref ref-type="bibr" rid="ref44">R Core Team, 2024</xref>). Imputation allows the estimation of our models to also model the variability due to missing data, and a complete case analysis could bias our results despite our sample size (<xref ref-type="bibr" rid="ref52">van Ginkel et al., 2020</xref>). Due to the online format of the self-rated data collection, no single items were missing, only entire questionnaires; thus, the sum score was imputed (see <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref> for the number of missing data points). For the Rasch dataset, the latent score of each symptom measure was used and imputed instead of the ordinal sum of the reformulated measure. This resulted in 40 imputed datasets. The imputation was performed using a linear mixed model with predictive mean matching (2l.pmm) (<xref ref-type="bibr" rid="ref50">Van Buuren, 2018</xref>). To combine the results from the different imputations, Rubin&#x2019;s rules were used (<xref ref-type="bibr" rid="ref50">Van Buuren, 2018</xref>), which included the modified standard errors and degrees of freedom of the mean prediction across imputation sets to correct for the variance due to the imputation. Comparisons between models (including Welch&#x2019;s <italic>t</italic>-test) were performed based on these means and standard errors with an alpha level of 0.05 and using two-sided tests.</p>
</sec>
<sec id="sec12">
<title>Validation</title>
<p>We used nested cross-validation (NCV). An NCV procedure in conjunction with multiple imputations improves the validity of confidence intervals (<xref ref-type="bibr" rid="ref3">Bates et al., 2021</xref>). All hyperparameters were tuned in the inner CV loop to prevent overfitting (<xref ref-type="bibr" rid="ref3">Bates et al., 2021</xref>; <xref ref-type="bibr" rid="ref6">Cawley and Talbot, 2010</xref>). The outer CV loop consists of 10 splits, and the inner of five. Each of the 2 &#x00D7; 20 imputed datasets underwent the 10 &#x00D7; 5 CV loops. The inner CV loop determined the hyperparameter tuning, whereas the outer CV loop was used to compare the model performances.</p>
</sec>
<sec id="sec13">
<title>Prediction metrics</title>
<p>Primary evaluation was performed using the Root Mean Squared Error (RMSE). Based on the scaling of the symptoms to 0&#x2013;1, the RMSE can be interpreted as the mean percentage error in the prediction. An RMSE of 0.1 would equal, on average, 10 percentage points from the true outcome in the prediction of the continuous outcome score.</p>
</sec>
</sec>
<sec sec-type="results" id="sec14">
<title>Results</title>
<p>The differences between the Base and Rasch datasets were very small, with less than a 0.0081-point difference in the RMSE score for each model in favor of the Base dataset (<xref ref-type="fig" rid="fig3">Figure 3</xref>). Thus, the Rasch dataset models had only a marginally worse score in RMSE, 5% (0.1389/0.1318) higher compared to the Base dataset models. For LR, the Base dataset had an RMSE of 0.1318 (95% CI, 0.1284, 0.1353), and Rasch had an RMSE of 0.1389 (95% CI, 0.1359, 0.1418) with a mean difference&#x202F;=&#x202F;&#x2212;0.0070, <italic>t</italic>(198.57)&#x202F;=&#x202F;&#x2212;3.09, <italic>p</italic>&#x202F;=&#x202F;0.0023. For BR, the Base dataset had an RMSE of 0.1318 (95% CI, 0.1284, 0.1352), and Rasch had an RMSE of 0.1388 (95% CI, 0.1358, 0.1418), with a mean difference&#x202F;=&#x202F;&#x2212;0.0070, <italic>t</italic>(205.44)&#x202F;=&#x202F;&#x2212;3.04, <italic>p</italic>&#x202F;=&#x202F;0.0027. For RF, Base had an RMSE of 0.1322 (95% CI, 0.1287, 0.1357), and Rasch had 0.1403 (95% CI, 0.1374, 0.1432), with a mean difference&#x202F;=&#x202F;&#x2212;0.0081, <italic>t</italic>(218.02)&#x202F;=&#x202F;&#x2212;3.51, <italic>p</italic>&#x202F;=&#x202F;0.0006. All models were significantly better than the null model of the dummy regression.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Root mean squared error for predicting symptom outcome. The root mean squared error (RMSE) mean and 95% CI were based on 20 imputed datasets for each dataset type. The RMSE can be interpreted as the mean percentage error in the prediction. The base dataset used the original ordinal sum scores of the questionnaires. The Rasch datasets used latent scores from psychometrically reformulated questionnaires. DR, Dummy regression, LR, Linear regression, BR, Bayesian ridge regression, RF, random forest. Weight corresponds to the weighted models using the inverse sum of the standard error for the latent scores, which was only available for the Rasch dataset.</p>
</caption>
<graphic xlink:href="fpsyg-17-1654996-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot with error bars titled RMSE by model compares Root Mean Square Error for different models labeled DR, LR, BR, RF, LR-weight, BR-weight, and RF-weight. Two data series are shown: Base type in dark blue and Rasch type in orange. Base type results are only shown for DR, LR, BR, and RF, where Base type consistently achieves lower RMSE than Rasch type. Rasch type results are also shown for LR-weight, BR-weight, and RF-weight, with RMSE values higher than the Base type. A legend in the top right indicates colors for each type.</alt-text>
</graphic>
</fig>
<p>Using the standard errors as weights did not improve the predictive performance of the Rasch models, having an identical performance for each of the models (<xref ref-type="fig" rid="fig3">Figure 3</xref>).</p>
<p>While the original questionnaires were less psychometrically robust (<xref ref-type="bibr" rid="ref21">Hentati Isacsson and Johansson, 2025</xref>) than the shortened versions based on Rasch Measurement Theory, they maintained strong stochastic ordering, with higher sum scores reliably indicating (correlated) higher latent scores, as evidenced by correlations of 0.90&#x2013;0.95. <xref ref-type="table" rid="tab1">Table 1</xref> shows the superior fit of the one-factor CFA for the shortened version of each questionnaire based on the RMT. We note that while some minor misspecifications also exist for the shortened version, it is always superior to the original version of the questionnaire, and the fit metrics for the shorter questionnaires had a much larger margin than the cutoffs. Thus, this CFA supports the unidimensional quality of the reformulated questionnaires but not the original versions.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>One-factor CFA for the original items and for the shortened versions based on Rasch Measurement Theory.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Questionnaire</th>
<th align="left" valign="top">Items</th>
<th align="left" valign="top">
<inline-formula>
<mml:math id="M12">
<mml:msup>
<mml:mi>&#x03C7;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:math>
</inline-formula>
</th>
<th align="left" valign="top">df</th>
<th align="left" valign="top"><italic>p</italic></th>
<th align="left" valign="top">CFI</th>
<th align="left" valign="top">TLI</th>
<th align="left" valign="top">RMSEA</th>
<th align="left" valign="top">SRMR</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle" rowspan="2">MADRS-S</td>
<td align="left" valign="middle">Org.<sup>2</sup></td>
<td align="left" valign="middle">1542.964</td>
<td align="left" valign="middle">27</td>
<td align="left" valign="middle">0</td>
<td align="left" valign="middle">0.975</td>
<td align="left" valign="middle">0.966</td>
<td align="left" valign="middle">0.095 [0.091, 0.099]</td>
<td align="left" valign="middle">0.043</td>
</tr>
<tr>
<td align="left" valign="middle">Rasch<sup>&#x002A;</sup></td>
<td align="left" valign="middle">37.94</td>
<td align="left" valign="middle">2</td>
<td align="left" valign="middle">0</td>
<td align="left" valign="middle">0.999</td>
<td align="left" valign="middle">0.997</td>
<td align="left" valign="middle">0.054 [0.04, 0.069]</td>
<td align="left" valign="middle">0.009</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="2">PDSS-SR</td>
<td align="left" valign="middle">Org.<sup>3</sup></td>
<td align="left" valign="middle">815.2</td>
<td align="left" valign="middle">14</td>
<td align="left" valign="middle">0</td>
<td align="left" valign="middle">0.931</td>
<td align="left" valign="middle">0.897</td>
<td align="left" valign="middle">0.185 [0.174, 0.196]</td>
<td align="left" valign="middle">0.075</td>
</tr>
<tr>
<td align="left" valign="middle">Rasch<sup>1</sup></td>
<td align="left" valign="middle">44.625</td>
<td align="left" valign="middle">2</td>
<td align="left" valign="middle">0</td>
<td align="left" valign="middle">0.986</td>
<td align="left" valign="middle">0.958</td>
<td align="left" valign="middle">0.113 [0.085, 0.143]</td>
<td align="left" valign="middle">0.03</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="2">LSAS-SR</td>
<td align="left" valign="middle">Org.<sup>3</sup></td>
<td align="left" valign="middle">64324.799</td>
<td align="left" valign="middle">1,080</td>
<td align="left" valign="middle">0</td>
<td align="left" valign="middle">0.533</td>
<td align="left" valign="middle">0.513</td>
<td align="left" valign="middle">0.191 [0.189, 0.192]</td>
<td align="left" valign="middle">0.192</td>
</tr>
<tr>
<td align="left" valign="middle">Rasch<sup>2</sup></td>
<td align="left" valign="middle">223.817</td>
<td align="left" valign="middle">20</td>
<td align="left" valign="middle">0</td>
<td align="left" valign="middle">0.96</td>
<td align="left" valign="middle">0.944</td>
<td align="left" valign="middle">0.08 [0.07, 0.089]</td>
<td align="left" valign="middle">0.045</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>All metrics were scaled. MADRS-S was fit with one additional item to avoid a just-identified model (by adding item 6, see <xref ref-type="bibr" rid="ref21">Hentati Isacsson and Johansson, 2025</xref>). <sup>&#x002A;123</sup>Indicates the level of dynamic cut-off for CFI, RMSEA, and SRMR that the model was closest to, with &#x002A; indicating no model misspecification and 1, 2, and 3 indicating a small, medium, and large misspecification or non-conforming model to the unidimensional factor. MADRS-S, Montgomery-&#x00C5;sberg Depression Rating Scale Self-report; LSAS-SR, Liebowitz Social Anxiety Scale-Self Report; PDSS-SR, Panic Disorder Symptom Scale-Self Report. Org., original items of the questionnaire; Rasch, the shortened version of the questionnaire based on Rasch analysis. &#x03C7;<sup>2</sup>, model chi-square; df, model degrees of freedom; p, model <italic>p</italic>-value; CFI, Comparative Fit Index, higher is better; TLI, Tucker&#x2013;Lewis index, higher is better; RMSEA, Root Mean Square Error of Approximation with 95% confidence intervals, lower is better; SRMR, Standardized Root Mean Square Residual, lower is better.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec sec-type="discussion" id="sec15">
<title>Discussion</title>
<p>Latent scores calculated using the psychometrically optimized, substantially shorter version, comprising 23% of the full scale, showed similar predictive performance (although slightly and negligibly worse) compared to the sum score of the full scale. Overall, the original ordinal sum scores had a marginally better performance, and there were no differences in performance across models. <xref ref-type="bibr" rid="ref24">Jacobucci and Grimm (2020)</xref> showed that varying only predictor reliability, and no other psychometric criteria, had a large impact on predictive performance, with underfitted models as a function of decreased reliability, and they did not simultaneously modify the outcome. <xref ref-type="bibr" rid="ref32">McNamara et al. (2022)</xref>, who modified both predictors and outcome at the same time, showed that there was no difference between predictive models, as was found in our results. At the same time, they showed that less reliable predictors and outcomes showed decreased predictive performance. Again, this result pertains only to modifying the noise or reliability of the predictors and outcome. Meanwhile, in this study, the questionnaires were reformulated according to four psychometric criteria using empirical data, and we do not have the same reliability estimates or control as in the simulation studies (<xref ref-type="bibr" rid="ref24">Jacobucci and Grimm, 2020</xref>; <xref ref-type="bibr" rid="ref32">McNamara et al., 2022</xref>).</p>
<p>As shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>, the reformulated questionnaires&#x2019; latent scores have a largely simple linear relationship with the original ordinal sums. While the reformulated questionnaires are more psychometrically robust and retain essential information, the original sum scores could contain useful predictive signals despite a greater amount of noise. The results are in line with those of <xref ref-type="bibr" rid="ref47">Sijtsma et al. (2024)</xref>, which indicate similar performance between sum scores and latent scores in a predictive framework. This suggests that while the reformulated questionnaires provide a more streamlined, unidimensional measure of the trait, as shown by the CFA, some variability present in the original scores may contribute to predictive performance. This is supported by the fact that the original sum scores show stochastic ordering with largely high correlations to the (what we assume to be) underlying latent trait, as estimated using Rasch analysis. While it is not perfectly linear at the periphery of the latent continuum (<xref ref-type="fig" rid="fig2">Figure 2</xref>), which a non-linear model such as random forest could detect, it remains largely linear overall, reminiscent of the empirical findings of <xref ref-type="bibr" rid="ref4">Bezruczko et al. (2016)</xref>. Consequently, because the format of the outcome variable in each case aligns with the format of the predictors, the predicted relationships remain largely linear and unchanged. Thus, based on both the experimental setup in this study and previous studies, it is not entirely unexpected that the predictive performance is similar.</p>
<p>However, we did not interpret the higher predictive performance of the sum score as reflective of a more truthful way of handling self-rated data. As clearly argued for and shown in the study by <xref ref-type="bibr" rid="ref46">Shmueli (2010)</xref>, predictive results cannot be used to draw inferences about which models are more truthful, but rather about which could be more predictive. Another possibility is that the Rasch-improved models improved precision and reduced overfitting. This resulted in a lower predictive performance but could be more generalizable (e.g., in overpredictive or other psychiatric settings) compared to the original sum scores. This hypothesis would need to be tested with other datasets from different settings.</p>
<p>Furthermore, while there was a statistically significant difference in the predictive performance, the absolute difference in the RMSE score was negligible. In addition, the latent scores used only a fraction of the original questionnaire items. This was an unintended consequence of the psychometrically valid versions of the questionnaires. The latent scores&#x2019; performance was worse by 5% relative to the sum scores, corresponding to an absolute RMSE difference of 0.0070 (0.70 percentage points on the 0&#x2013;1 scale), which is a considerable retention of information considering that only 23% of the items were retained across all questionnaires. Fewer items could be beneficial because it could significantly reduce the response burden for patients and reduce the risk of missing data, which in turn could facilitate repeated and more widespread measurements within routine care.</p>
<p>While we also trained models that weighted their predictions based on the inverse sum of the standard errors of the latent scores, the information from this weight did not improve the prediction compared with not using the weights. While not explicitly containing the weight variable in the non-weighted models, the non-weighted models did have access to the standard errors, which composed the weights, and these possibly had a larger influence than the composite. Future studies could explore other ways to utilize the standard errors, perhaps using another predictive framework that implements the standard error explicitly in the model (<xref ref-type="bibr" rid="ref29">Kurz, 2023</xref>).</p>
<p>Since both the predictor and outcome changed simultaneously in the two conditions, a future study might investigate an independent set of predictors to predict the two different types of outcomes instead. This could possibly reveal whether a more psychometrically solid outcome variable could improve predictive performance. In addition, a future study could investigate if latent scores from the original questionnaires, without improving their psychometric qualities, have an impact on predictive performance. Furthermore, previous research has indicated improved predictive performance using items as predictors, as opposed to their summation (<xref ref-type="bibr" rid="ref35">McNeish, 2024</xref>). While this is counter to our previous research in a similar setting (<xref ref-type="bibr" rid="ref20">Hentati Isacsson et al., 2024a</xref>), it could be valuable in a setting that also investigates latent scores compared to a simple summation. The similar predictive performance of the latent score setup, despite using only 23% of the items, is beneficial. This study did not aim to investigate methods of shortening questionnaires and their subsequent impact on predictive performance. To investigate these aspects, another methodological setup would be needed, e.g., using the same subset of items. Such a study could also incorporate the possible impact of missing data on the analyses, such as comparisons of the impact of missing data, where one could simulate missing data and compare complete-case analyses and imputation setups. Additionally, this experimental setup could benefit from longitudinal models that take the repeated structure of the data into account, despite previous findings. Finally, there has been recent progress in predictive models with measurement errors, which indicates that it could be beneficial to predict intervals and use these instead of point predictions (<xref ref-type="bibr" rid="ref26">Jiang and Ma, 2024</xref>).</p>
</sec>
<sec sec-type="conclusions" id="sec16">
<title>Conclusion</title>
<p>In conclusion, using empirical data from psychological treatment, our findings indicate that using latent scores as predictors and outcomes from a psychometrically improved version of the questionnaire showed similar predictive performance to the original ordinal sum scores. While the psychometric properties were improved by the Rasch analyses, it is inconclusive whether this also improved precision and reduced overfitting or if the Base dataset retained useful variability. For the models using the latent score, their predictive performance was marginally reduced by 5% (an absolute RMSE increase of 0.0070, i.e., 0.70 percentage points) while using only 23% of the original items. This suggests that while reformulated questionnaires can streamline measurement and lower the burden on patients, their impact on improving predictive performance in this study was limited.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec17">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="sec18">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Regional ethical review board in Stockholm (Dnr: 2011/2091&#x2013;31/3, amendment 2016/21&#x2013;32, 2017/2320&#x2013;32, and 2018/2550&#x2013;32). The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin because in Sweden, alongside the GDPR, the Swedish Patient Data Act (SFS 2008:355) applies. It determines that data gathered in a routine care setting, such as EHR data, only require an opt-out in order to allow a secondary use for research. Our data were collected in such a setting and belong to the same category, which is why the ethical board approved their use. The EU/GDPR basis of this law is data processing in the public interest. While most other countries only apply this exemption on a case-by-case basis, Sweden has made the general ruling for research on health care data. This law is also the legal basis for many studies using the so-called Swedish Registry data, none of which require written consent. While we understand it is unusual in the international research context, in Sweden neither explicit written consent nor an exemption from the ethical board is necessary under the circumstances that apply for the kind of research this study is. It is in this form the ethical review board also approved this study.</p>
</sec>
<sec sec-type="author-contributions" id="sec19">
<title>Author contributions</title>
<p>NH: Data curation, Writing &#x2013; original draft, Methodology, Conceptualization, Software, Investigation, Visualization, Project administration, Validation, Formal analysis, Writing &#x2013; review &#x0026; editing. MJ: Validation, Software, Conceptualization, Visualization, Methodology, Supervision, Investigation, Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft. VK: Data curation, Resources, Funding acquisition, Conceptualization, Project administration, Writing &#x2013; review &#x0026; editing, Supervision.</p>
</sec>
<sec sec-type="COI-statement" id="sec20">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec21">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec22">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec23">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fpsyg.2026.1654996/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fpsyg.2026.1654996/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.DOCX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_1.CSV" id="SM2" mimetype="text/comma-separated-values" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_2.ZIP" id="SM3" mimetype="application/zip" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Andrich</surname><given-names>D.</given-names></name> <name><surname>Marais</surname><given-names>I.</given-names></name></person-group> (<year>2019</year>). <source>A course in Rasch measurement theory: measuring in the educational, social and health sciences</source>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer Singapore</publisher-name>.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barkham</surname><given-names>M.</given-names></name> <name><surname>De Jong</surname><given-names>K.</given-names></name> <name><surname>Delgadillo</surname><given-names>J.</given-names></name> <name><surname>Lutz</surname><given-names>W.</given-names></name></person-group> (<year>2023</year>). <article-title>Routine outcome monitoring (ROM) and feedback: research review and recommendations</article-title>. <source>Psychother. Res.</source> <volume>33</volume>, <fpage>841</fpage>&#x2013;<lpage>855</lpage>. doi: <pub-id pub-id-type="doi">10.1080/10503307.2023.2181114</pub-id>, <pub-id pub-id-type="pmid">36931228</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Bates</surname><given-names>S.</given-names></name> <name><surname>Hastie</surname><given-names>T.</given-names></name> <name><surname>Tibshirani</surname><given-names>R.</given-names></name></person-group> (<year>2021</year>). <article-title>Cross-validation: what does it estimate and how well does it do it?</article-title>. <comment>arXiv:2104.00673</comment>.</mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bezruczko</surname><given-names>N.</given-names></name> <name><surname>Fatani</surname><given-names>S. S.</given-names></name> <name><surname>Magari</surname><given-names>N.</given-names></name></person-group> (<year>2016</year>). <article-title>Three tales of change: ordinal scores, residualized gains, and Rasch logits&#x2014;when are they interchangeable?</article-title> <source>SAGE Open</source> <volume>6</volume>:<fpage>2158244016659905</fpage>. doi: <pub-id pub-id-type="doi">10.1177/2158244016659905</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Breiman</surname><given-names>L.</given-names></name></person-group> (<year>2001</year>). <article-title>Random forests</article-title>. <source>Mach. Learn.</source> <volume>45</volume>, <fpage>5</fpage>&#x2013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cawley</surname><given-names>G. C.</given-names></name> <name><surname>Talbot</surname><given-names>N. L. C.</given-names></name></person-group> (<year>2010</year>). <article-title>On over-fitting in model selection and subsequent selection bias in performance evaluation</article-title>. <source>J. Mach. Learn. Res.</source> <volume>11</volume>, <fpage>2079</fpage>&#x2013;<lpage>2107</lpage>. doi: <pub-id pub-id-type="doi">10.5555/1756006.1859921</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Christensen</surname><given-names>K. B.</given-names></name> <name><surname>Kreiner</surname><given-names>S.</given-names></name> <name><surname>Mesbah</surname><given-names>M.</given-names></name></person-group> (<year>2013</year>). <source>Rasch models in health</source>. <publisher-loc>Great Britain, United States</publisher-loc>: <publisher-name>ISTE Ltd and John Wiley &#x0026; Sons</publisher-name>.</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cohen</surname><given-names>P.</given-names></name> <name><surname>Cohen</surname><given-names>J.</given-names></name> <name><surname>Aiken</surname><given-names>L. S.</given-names></name> <name><surname>West</surname><given-names>S. G.</given-names></name></person-group> (<year>1999</year>). <article-title>The problem of units and the circumstance for POMP</article-title>. <source>Multivar. Behav. Res.</source> <volume>34</volume>, <fpage>315</fpage>&#x2013;<lpage>346</lpage>. doi: <pub-id pub-id-type="doi">10.1207/S15327906MBR3403_2</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Collins</surname><given-names>G. S.</given-names></name> <name><surname>Moons</surname><given-names>K. G. M.</given-names></name> <name><surname>Dhiman</surname><given-names>P.</given-names></name> <name><surname>Riley</surname><given-names>R. D.</given-names></name> <name><surname>Beam</surname><given-names>A. L.</given-names></name> <name><surname>Van Calster</surname><given-names>B.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title>. <source>BMJ</source> <volume>385</volume>:<fpage>e078378</fpage>. doi: <pub-id pub-id-type="doi">10.1136/bmj-2023-078378</pub-id>, <pub-id pub-id-type="pmid">38626948</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>El Alaoui</surname><given-names>S.</given-names></name> <name><surname>Hedman</surname><given-names>E.</given-names></name> <name><surname>Kaldo</surname><given-names>V.</given-names></name> <name><surname>Hesser</surname><given-names>H.</given-names></name> <name><surname>Kraepelien</surname><given-names>M.</given-names></name> <name><surname>Andersson</surname><given-names>E.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Effectiveness of internet-based cognitive&#x2013;behavior therapy for social anxiety disorder in clinical psychiatry</article-title>. <source>J. Consult. Clin. Psychol.</source> <volume>83</volume>, <fpage>902</fpage>&#x2013;<lpage>914</lpage>. doi: <pub-id pub-id-type="doi">10.1037/a0039198</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Elson</surname><given-names>M.</given-names></name> <name><surname>Hussey</surname><given-names>I.</given-names></name> <name><surname>Alsalti</surname><given-names>T.</given-names></name> <name><surname>Arslan</surname><given-names>R. C.</given-names></name></person-group> (<year>2023</year>). <article-title>Psychological measures aren&#x2019;t toothbrushes</article-title>. <source>Commun. Psychol.</source> <volume>1</volume>, <fpage>1</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s44271-023-00026-9</pub-id>, <pub-id pub-id-type="pmid">39242966</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fisher</surname><given-names>W. P.</given-names></name> <name><surname>Burton</surname><given-names>E. C.</given-names></name></person-group> (<year>2010</year>). <article-title>Embedding measurement within existing computerized data systems: scaling clinical laboratory and medical records heart failure data to predict ICU admission</article-title>. <source>J. Appl. Meas.</source> <volume>11</volume>, <fpage>271</fpage>&#x2013;<lpage>287</lpage>.</mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Flake</surname><given-names>J. K.</given-names></name> <name><surname>Davidson</surname><given-names>I. J.</given-names></name> <name><surname>Wong</surname><given-names>O.</given-names></name> <name><surname>Pek</surname><given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Construct validity and the validity of replication studies: a systematic review</article-title>. <source>Am. Psychol.</source> <volume>77</volume>, <fpage>576</fpage>&#x2013;<lpage>588</lpage>. doi: <pub-id pub-id-type="doi">10.1037/amp0001006</pub-id>, <pub-id pub-id-type="pmid">35482669</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Flake</surname><given-names>J. K.</given-names></name> <name><surname>Fried</surname><given-names>E. I.</given-names></name></person-group> (<year>2020</year>). <article-title>Measurement schmeasurement: questionable measurement practices and how to avoid them</article-title>. <source>Adv. Methods Pract. Psychol. Sci.</source> <volume>3</volume>, <fpage>456</fpage>&#x2013;<lpage>465</lpage>. doi: <pub-id pub-id-type="doi">10.1177/2515245920952393</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Forsell</surname><given-names>E.</given-names></name> <name><surname>Jernel&#x00F6;v</surname><given-names>S.</given-names></name> <name><surname>Blom</surname><given-names>K.</given-names></name> <name><surname>Kraepelien</surname><given-names>M.</given-names></name> <name><surname>Svanborg</surname><given-names>C.</given-names></name> <name><surname>Andersson</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Proof of concept for an adaptive treatment strategy to prevent failures in internet-delivered CBT: a single-blind randomized clinical trial with insomnia patients</article-title>. <source>Am. J. Psychiatry</source> <volume>176</volume>, <fpage>315</fpage>&#x2013;<lpage>323</lpage>. doi: <pub-id pub-id-type="doi">10.1176/appi.ajp.2018.18060699</pub-id>, <pub-id pub-id-type="pmid">30696270</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fresco</surname><given-names>D. M.</given-names></name> <name><surname>Coles</surname><given-names>M. E.</given-names></name> <name><surname>Heimberg</surname><given-names>R. G.</given-names></name> <name><surname>Liebowitz</surname><given-names>M. R.</given-names></name> <name><surname>Hami</surname><given-names>S.</given-names></name> <name><surname>Stein</surname><given-names>M. B.</given-names></name> <etal/></person-group>. (<year>2001</year>). <article-title>The Liebowitz social anxiety scale: a comparison of the psychometric properties of self-report and clinician-administered formats</article-title>. <source>Psychol. Med.</source> <volume>31</volume>, <fpage>1025</fpage>&#x2013;<lpage>1035</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S0033291701004056</pub-id>, <pub-id pub-id-type="pmid">11513370</pub-id></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Grund</surname><given-names>S.</given-names></name> <name><surname>L&#x00FC;dtke</surname><given-names>O.</given-names></name> <name><surname>Robitzsch</surname><given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>Multiple imputation of missing data for multilevel models: simulations and recommendations</article-title>. <source>Organ. Res. Methods</source> <volume>21</volume>, <fpage>111</fpage>&#x2013;<lpage>149</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1094428117703686</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hedman</surname><given-names>E.</given-names></name> <name><surname>Lj&#x00F3;tsson</surname><given-names>B.</given-names></name> <name><surname>Kaldo</surname><given-names>V.</given-names></name> <name><surname>Hesser</surname><given-names>H.</given-names></name> <name><surname>El Alaoui</surname><given-names>S.</given-names></name> <name><surname>Kraepelien</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Effectiveness of internet-based cognitive behaviour therapy for depression in routine psychiatric care</article-title>. <source>J. Affect. Disord.</source> <volume>155</volume>, <fpage>49</fpage>&#x2013;<lpage>58</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jad.2013.10.023</pub-id>, <pub-id pub-id-type="pmid">24238951</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hedman</surname><given-names>E.</given-names></name> <name><surname>Lj&#x00F3;tsson</surname><given-names>B.</given-names></name> <name><surname>R&#x00FC;ck</surname><given-names>C.</given-names></name> <name><surname>Bergstr&#x00F6;m</surname><given-names>J.</given-names></name> <name><surname>Andersson</surname><given-names>G.</given-names></name> <name><surname>Kaldo</surname><given-names>V.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>Effectiveness of internet-based cognitive behaviour therapy for panic disorder in routine psychiatric care</article-title>. <source>Acta Psychiatr. Scand.</source> <volume>128</volume>, <fpage>457</fpage>&#x2013;<lpage>467</lpage>. doi: <pub-id pub-id-type="doi">10.1111/acps.12079</pub-id>, <pub-id pub-id-type="pmid">23406572</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hentati Isacsson</surname><given-names>N.</given-names></name> <name><surname>Ben Abdesslem</surname><given-names>F.</given-names></name> <name><surname>Forsell</surname><given-names>E.</given-names></name> <name><surname>Boman</surname><given-names>M.</given-names></name> <name><surname>Kaldo</surname><given-names>V.</given-names></name></person-group> (<year>2024a</year>). <article-title>Methodological choices and clinical usefulness for machine learning predictions of outcome in internet-based cognitive behavioural therapy</article-title>. <source>Commun. Med.</source> <volume>4</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s43856-024-00626-4</pub-id>, <pub-id pub-id-type="pmid">39384934</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Hentati Isacsson</surname><given-names>N.</given-names></name> <name><surname>Johansson</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <source>Three psychometric evals</source>. Available online at: <ext-link xlink:href="https://intraverbal.github.io/" ext-link-type="uri">https://intraverbal.github.io/</ext-link></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hentati Isacsson</surname><given-names>N.</given-names></name> <name><surname>Zantvoort</surname><given-names>K.</given-names></name> <name><surname>Forsell</surname><given-names>E.</given-names></name> <name><surname>Boman</surname><given-names>M.</given-names></name> <name><surname>Kaldo</surname><given-names>V.</given-names></name></person-group> (<year>2024b</year>). <article-title>Making the most out of timeseries symptom data: a machine learning study on symptom predictions of internet-based CBT</article-title>. <source>Internet Interv.</source> <volume>38</volume>:<fpage>100773</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.invent.2024.100773</pub-id>, <pub-id pub-id-type="pmid">39310714</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Houck</surname><given-names>P. R.</given-names></name> <name><surname>Spiegel</surname><given-names>D. A.</given-names></name> <name><surname>Shear</surname><given-names>M. K.</given-names></name> <name><surname>Rucci</surname><given-names>P.</given-names></name></person-group> (<year>2002</year>). <article-title>Reliability of the self-report version of the panic disorder severity scale</article-title>. <source>Depress. Anxiety</source> <volume>15</volume>, <fpage>183</fpage>&#x2013;<lpage>185</lpage>. doi: <pub-id pub-id-type="doi">10.1002/da.10049</pub-id>, <pub-id pub-id-type="pmid">12112724</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jacobucci</surname><given-names>R.</given-names></name> <name><surname>Grimm</surname><given-names>K. J.</given-names></name></person-group> (<year>2020</year>). <article-title>Machine learning and psychological research: the unexplored effect of measurement</article-title>. <source>Perspect. Psychol. Sci.</source> <volume>15</volume>, <fpage>809</fpage>&#x2013;<lpage>816</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1745691620902467</pub-id>, <pub-id pub-id-type="pmid">32348703</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Jaeger</surname><given-names>B. C.</given-names></name> <name><surname>Tierney</surname><given-names>N. J.</given-names></name> <name><surname>Simon</surname><given-names>N. R.</given-names></name></person-group> (<year>2020</year>). <article-title>When to impute? Imputation before and during cross-validation</article-title>. <comment>arXiv:2010.00718 [Cs, Stat]</comment>.</mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Jiang</surname><given-names>F.</given-names></name> <name><surname>Ma</surname><given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Prediction in measurement error models</article-title>. <comment>arXiv:2405.10461</comment>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2405.10461</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Johansson</surname><given-names>M.</given-names></name> <name><surname>Preuter</surname><given-names>M.</given-names></name> <name><surname>Karlsson</surname><given-names>S.</given-names></name> <name><surname>M&#x00F6;llerberg</surname><given-names>M.-L.</given-names></name> <name><surname>Svensson</surname><given-names>H.</given-names></name> <name><surname>Melin</surname><given-names>J.</given-names></name></person-group> (<year>2023</year>). <source>Valid and reliable? Basic and expanded recommendations for psychometric reporting and quality assessment</source>: <publisher-name>OSF</publisher-name>.</mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kreiner</surname><given-names>S.</given-names></name></person-group> (<year>2007</year>). <article-title>Validity and objectivity: reflections on the role and nature of Rasch models</article-title>. <source>Nord. Psychol.</source> <volume>59</volume>, <fpage>268</fpage>&#x2013;<lpage>298</lpage>. doi: <pub-id pub-id-type="doi">10.1027/1901-2276.59.3.268</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Kurz</surname><given-names>A. S.</given-names></name></person-group> (<year>2023</year>). <source>Statistical rethinking with brms, ggplot2, and the tidyverse: Second edition (version 0.4.0)</source>. Available online at: <ext-link xlink:href="https://bookdown.org/content/4857/" ext-link-type="uri">https://bookdown.org/content/4857/</ext-link></mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lilienfeld</surname><given-names>S. O.</given-names></name> <name><surname>Strother</surname><given-names>A. N.</given-names></name></person-group> (<year>2020</year>). <article-title>Psychological measurement and the replication crisis: four sacred cows</article-title>. <source>Can. Psychol. Psychol. Can.</source> <volume>61</volume>, <fpage>281</fpage>&#x2013;<lpage>288</lpage>. doi: <pub-id pub-id-type="doi">10.1037/cap0000236</pub-id></mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Luijken</surname><given-names>K.</given-names></name> <name><surname>Groenwold</surname><given-names>R. H. H.</given-names></name> <name><surname>Van Calster</surname><given-names>B.</given-names></name> <name><surname>Steyerberg</surname><given-names>E. W.</given-names></name> <name><surname>van Smeden</surname><given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>Impact of predictor measurement heterogeneity across settings on the performance of prediction models: a measurement error perspective</article-title>. <source>Stat. Med.</source> <volume>38</volume>, <fpage>3444</fpage>&#x2013;<lpage>3459</lpage>. doi: <pub-id pub-id-type="doi">10.1002/sim.8183</pub-id>, <pub-id pub-id-type="pmid">31148207</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McNamara</surname><given-names>M. E.</given-names></name> <name><surname>Zisser</surname><given-names>M.</given-names></name> <name><surname>Beevers</surname><given-names>C. G.</given-names></name> <name><surname>Shumake</surname><given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Not just &#x201C;big&#x201D; data: importance of sample size, measurement error, and uninformative predictors for developing prognostic models for digital interventions</article-title>. <source>Behav. Res. Ther.</source> <volume>153</volume>:<fpage>104086</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.brat.2022.104086</pub-id>, <pub-id pub-id-type="pmid">35462242</pub-id></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McNeish</surname><given-names>D.</given-names></name></person-group> (<year>2018</year>). <article-title>Thanks coefficient alpha, we&#x2019;ll take it from here</article-title>. <source>Psychol. Methods</source> <volume>23</volume>, <fpage>412</fpage>&#x2013;<lpage>433</lpage>. doi: <pub-id pub-id-type="doi">10.1037/met0000144</pub-id>, <pub-id pub-id-type="pmid">28557467</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McNeish</surname><given-names>D.</given-names></name></person-group> (<year>2022</year>). <article-title>Limitations of the sum-and-alpha approach to measurement in behavioral research</article-title>. <source>Policy Insights Behav. Brain Sci.</source> <volume>9</volume>, <fpage>196</fpage>&#x2013;<lpage>203</lpage>. doi: <pub-id pub-id-type="doi">10.1177/23727322221117144</pub-id></mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McNeish</surname><given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Practical implications of sum scores being psychometrics&#x2019; greatest accomplishment</article-title>. <source>Psychometrika</source> <volume>89</volume>, <fpage>1148</fpage>&#x2013;<lpage>1169</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11336-024-09988-z</pub-id>, <pub-id pub-id-type="pmid">39031300</pub-id></mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>McNeish</surname><given-names>D.</given-names></name> <name><surname>Wolf</surname><given-names>M. G.</given-names></name></person-group> (<year>2020a</year>). <source>Dynamic fit index cutoffs for confirmatory factor analysis models</source>: <publisher-name>OSF</publisher-name>.</mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McNeish</surname><given-names>D.</given-names></name> <name><surname>Wolf</surname><given-names>M. G.</given-names></name></person-group> (<year>2020b</year>). <article-title>Thinking twice about sum scores</article-title>. <source>Behav. Res. Methods</source> <volume>52</volume>, <fpage>2287</fpage>&#x2013;<lpage>2305</lpage>. doi: <pub-id pub-id-type="doi">10.3758/s13428-020-01398-0</pub-id>, <pub-id pub-id-type="pmid">32323277</pub-id></mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Michell</surname><given-names>J.</given-names></name></person-group> (<year>1997</year>). <article-title>Quantitative science and the definition of measurement in psychology</article-title>. <source>Br. J. Psychol.</source> <volume>88</volume>, <fpage>355</fpage>&#x2013;<lpage>383</lpage>. doi: <pub-id pub-id-type="doi">10.1111/j.2044-8295.1997.tb02641.x</pub-id></mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Montgomery</surname><given-names>S. A.</given-names></name> <name><surname>&#x00C5;sberg</surname><given-names>M.</given-names></name></person-group> (<year>1979</year>). <article-title>A new depression scale designed to be sensitive to change</article-title>. <source>Br. J. Psychiatry</source> <volume>134</volume>, <fpage>382</fpage>&#x2013;<lpage>389</lpage>. doi: <pub-id pub-id-type="doi">10.1192/bjp.134.4.382</pub-id>, <pub-id pub-id-type="pmid">444788</pub-id></mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pedregosa</surname><given-names>F.</given-names></name> <name><surname>Varoquaux</surname><given-names>G.</given-names></name> <name><surname>Gramfort</surname><given-names>A.</given-names></name> <name><surname>Michel</surname><given-names>V.</given-names></name> <name><surname>Thirion</surname><given-names>B.</given-names></name> <name><surname>Grisel</surname><given-names>O.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. doi: <pub-id pub-id-type="doi">10.5555/1953048.2078195</pub-id></mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pendrill</surname><given-names>L. R.</given-names></name></person-group> (<year>2018</year>). <article-title>Assuring measurement quality in person-centred healthcare</article-title>. <source>Meas. Sci. Technol.</source> <volume>29</volume>:<fpage>034003</fpage>. doi: <pub-id pub-id-type="doi">10.1088/1361-6501/aa9cd2</pub-id></mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pendrill</surname><given-names>L. R.</given-names></name> <name><surname>Melin</surname><given-names>J.</given-names></name> <name><surname>Stavelin</surname><given-names>A.</given-names></name> <name><surname>Nordin</surname><given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Modernising receiver operating characteristic (ROC) curves</article-title>. <source>Algorithms</source> <volume>16</volume>, <fpage>1</fpage>&#x2013;<lpage>22</lpage>. doi: <pub-id pub-id-type="doi">10.3390/a16050253</pub-id></mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="other"><collab id="coll1">Python Software Foundation</collab> (<year>2023</year>). <source>Python programming language</source>. Available online at: <ext-link xlink:href="https://www.python.org/" ext-link-type="uri">https://www.python.org/</ext-link></mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="other"><collab id="coll2">R Core Team</collab> (<year>2024</year>). <source>R: a language and environment for statistical computing</source>. <publisher-name>R Foundation for Statistical Computing</publisher-name>. Available online at: <ext-link xlink:href="https://www.R-project.org/" ext-link-type="uri">https://www.R-project.org/</ext-link> (Accessed December 12, 2025).</mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rosseel</surname><given-names>Y.</given-names></name></person-group> (<year>2012</year>). <article-title>Lavaan: an R package for structural equation modeling</article-title>. <source>J. Stat. Softw.</source> <volume>48</volume>, <fpage>1</fpage>&#x2013;<lpage>36</lpage>. doi: <pub-id pub-id-type="doi">10.18637/jss.v048.i02</pub-id></mixed-citation></ref>
<ref id="ref46"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shmueli</surname><given-names>G.</given-names></name></person-group> (<year>2010</year>). <article-title>To explain or to predict?</article-title> <source>Stat. Sci.</source> <volume>25</volume>, <fpage>289</fpage>&#x2013;<lpage>310</lpage>. doi: <pub-id pub-id-type="doi">10.1214/10-STS330</pub-id></mixed-citation></ref>
<ref id="ref47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sijtsma</surname><given-names>K.</given-names></name> <name><surname>Ellis</surname><given-names>J. L.</given-names></name> <name><surname>Borsboom</surname><given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Recognize the value of the sum score, psychometrics&#x2019; greatest accomplishment</article-title>. <source>Psychometrika</source> <volume>89</volume>, <fpage>84</fpage>&#x2013;<lpage>117</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11336-024-09964-7</pub-id>, <pub-id pub-id-type="pmid">38627311</pub-id></mixed-citation></ref>
<ref id="ref48"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tipping</surname><given-names>M. E.</given-names></name></person-group> (<year>2001</year>). <article-title>Sparse Bayesian learning and the relevance vector machine</article-title>. <source>J. Mach. Learn. Res.</source> <volume>1</volume>, <fpage>211</fpage>&#x2013;<lpage>244</lpage>.</mixed-citation></ref>
<ref id="ref49"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Titov</surname><given-names>N.</given-names></name> <name><surname>Dear</surname><given-names>B.</given-names></name> <name><surname>Nielssen</surname><given-names>O.</given-names></name> <name><surname>Staples</surname><given-names>L.</given-names></name> <name><surname>Hadjistavropoulos</surname><given-names>H.</given-names></name> <name><surname>Nugent</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>ICBT in routine care: a descriptive analysis of successful clinics in five countries</article-title>. <source>Internet Interv.</source> <volume>13</volume>, <fpage>108</fpage>&#x2013;<lpage>115</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.invent.2018.07.006</pub-id>, <pub-id pub-id-type="pmid">30206525</pub-id></mixed-citation></ref>
<ref id="ref50"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>van Buuren</surname><given-names>S.</given-names></name></person-group> (<year>2018</year>). <source>Flexible imputation of missing data</source>. <edition>2nd</edition> Edn: <publisher-name>CRC Press</publisher-name>. Available online at: <ext-link xlink:href="https://stefvanbuuren.name" ext-link-type="uri">https://stefvanbuuren.name</ext-link></mixed-citation></ref>
<ref id="ref51"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>van Buuren</surname><given-names>S.</given-names></name> <name><surname>Groothuis-Oudshoorn</surname><given-names>K.</given-names></name></person-group> (<year>2011</year>). <article-title>Mice: multivariate imputation by chained equations in R</article-title>. <source>J. Stat. Softw.</source> <volume>45</volume>, <fpage>1</fpage>&#x2013;<lpage>67</lpage>. doi: <pub-id pub-id-type="doi">10.18637/jss.v045.i03</pub-id></mixed-citation></ref>
<ref id="ref52"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>van Ginkel</surname><given-names>J. R.</given-names></name> <name><surname>Linting</surname><given-names>M.</given-names></name> <name><surname>Rippe</surname><given-names>R. C. A.</given-names></name> <name><surname>van der Voort</surname><given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Rebutting existing misconceptions about multiple imputation as a method for handling missing data</article-title>. <source>J. Pers. Assess.</source> <volume>102</volume>, <fpage>297</fpage>&#x2013;<lpage>308</lpage>. doi: <pub-id pub-id-type="doi">10.1080/00223891.2018.1530680</pub-id>, <pub-id pub-id-type="pmid">30657714</pub-id></mixed-citation></ref>
<ref id="ref53"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Warm</surname><given-names>T. A.</given-names></name></person-group> (<year>1989</year>). <article-title>Weighted likelihood estimation of ability in item response theory</article-title>. <source>Psychometrika</source> <volume>54</volume>, <fpage>427</fpage>&#x2013;<lpage>450</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF02294627</pub-id></mixed-citation></ref>
<ref id="ref54"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Widaman</surname><given-names>K. F.</given-names></name> <name><surname>Revelle</surname><given-names>W.</given-names></name></person-group> (<year>2023</year>). <article-title>Thinking thrice about sum scores, and then some more about measurement and analysis</article-title>. <source>Behav. Res. Methods</source> <volume>55</volume>, <fpage>788</fpage>&#x2013;<lpage>806</lpage>. doi: <pub-id pub-id-type="doi">10.3758/s13428-022-01849-w</pub-id>, <pub-id pub-id-type="pmid">35469086</pub-id></mixed-citation></ref>
<ref id="ref55"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zantvoort</surname><given-names>K.</given-names></name> <name><surname>Hentati Isacsson</surname><given-names>N.</given-names></name> <name><surname>Funk</surname><given-names>B.</given-names></name> <name><surname>Kaldo</surname><given-names>V.</given-names></name></person-group> (<year>2024</year>). <article-title>Dataset size versus homogeneity: a machine learning study on pooling intervention data in e-mental health dropout predictions</article-title>. <source>Digit. Health</source> <volume>10</volume>:<fpage>20552076241248920</fpage>. doi: <pub-id pub-id-type="doi">10.1177/20552076241248920</pub-id>, <pub-id pub-id-type="pmid">38757087</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/162779/overview">Mariagrazia Benassi</ext-link>, University of Bologna, Italy</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/913057/overview">Arthur Trognon</ext-link>, CLINICOG, France</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3292376/overview">Xiaodan Tang</ext-link>, Feinberg School of Medicine, Northwestern University, United States</p>
</fn>
</fn-group>
</back>
</article>