<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="EN" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Educ.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Education</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Educ.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2504-284X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/feduc.2026.1759878</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>An atomized approach to assessing energy problem solving in physics using multidimensional item response theory</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Meyer</surname> <given-names>Andr&#x00E9;</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2219779/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Friege</surname> <given-names>Gunnar</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="http://loop.frontiersin.org/people/3367116/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>Department of Mathematics and Physics, Institute for Mathematics and Physics Education, Physics Education Group, Leibniz University Hannover</institution>, <city>Hanover</city>, <country country="de">Germany</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Andr&#x00E9; Meyer, <email xlink:href="mailto:a.meyer@idmp.uni-hannover.de">a.meyer@idmp.uni-hannover.de</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-13">
<day>13</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>11</volume>
<elocation-id>1759878</elocation-id>
<history>
<date date-type="received">
<day>03</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>20</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>22</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Meyer and Friege.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Meyer and Friege</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-13">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Problem solving is a central competence in STEM education, yet many secondary school students struggle to coordinate the multiple skills required for successful problem solving. Early assessment of problem-solving skills can support individual feedback during this pivotal phase of schooling. However, existing assessment approaches focus mainly on complete problem solutions, which are resource-intensive and cannot adequately capture skills of students who fail in early phases of the problem-solving process.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address this gap, the atomized problem-solving test (APST) was developed as a digital instrument that independently assesses four problem-solving subprocesses: Representation, Planning, Execution, and Evaluation. The APST was evaluated in two consecutive studies with a total of 800 German secondary school students within a web-based learning environment on energy conservation. Multidimensional item response theory (MIRT) was used to examine item quality and dimensional structure, complemented by supplemental assessments of conceptual knowledge, school grades, and rubric-based analyses of written problem solutions.</p>
</sec>
<sec>
<title>Results</title>
<p>The analyses supported a four-dimensional structure aligned with the theoretical design of the APST. The items showed acceptable model fit and reliable measurement of the intended subprocesses. All APST dimensions were moderately associated with conceptual knowledge of energy and with school grades in physics and mathematics, while no meaningful correlations were found with gender or native language. Evaluation emerged as a distinctive subprocess, showing strong associations with other subprocesses&#x2013;particularly Execution&#x2013;alongside evaluation-specific skills.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The results indicate that the APST enables valid and reliable assessment of problem-solving subprocess skills in secondary physics education. At the same time, the findings underscore limitations of atomized assessments for measuring general problem-solving competence, as independent decision making is not assessed. The prominent role of Evaluation highlights its integrative function within the problem-solving process and points to important implications for both assessment design and future research.</p>
</sec>
</abstract>
<kwd-group>
<kwd>assessment</kwd>
<kwd>energy</kwd>
<kwd>multidimensional item response theory</kwd>
<kwd>physics education</kwd>
<kwd>problem solving</kwd>
<kwd>secondary school teaching</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported within the Project &#x201C;LernMINT - data-driven teaching in STEM subjects&#x201D; by the Ministry of Science and Culture Lower Saxony (grant number 51410078) and by the German Academic Scholarship Foundation (Doctoral Scholarship/No grant number). The publication of this article was funded by the Open Access Fund of Leibniz Universit&#x00E4;t Hannover.</funding-statement>
</funding-group>
<counts>
<fig-count count="1"/>
<table-count count="4"/>
<equation-count count="0"/>
<ref-count count="52"/>
<page-count count="12"/>
<word-count count="10230"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>STEM Education</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="S1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Problem solving is a complex and challenging task that is considered to be one of the most important skills students have to achieve in STEM education (<xref ref-type="bibr" rid="B21">Jang, 2016</xref>). It requires multiple cognitive abilities (<xref ref-type="bibr" rid="B47">Tr&#x00E4;ff et al., 2019</xref>). Especially quantitative physics problem solving requires solid conceptual knowledge and mathematical skills that have to be applied to unfamiliar situations (<xref ref-type="bibr" rid="B36">OECD, 2023</xref>; <xref ref-type="bibr" rid="B46">Tong et al., 2025</xref>; <xref ref-type="bibr" rid="B34">Nilsen et al., 2013</xref>; <xref ref-type="bibr" rid="B50">Tuminaro and Redish, 2007</xref>). Students are first introduced to such quantitative considerations in secondary physics teaching. They are taught about the principle of energy conservation in the context of mechanics and thermodynamics. In this context, they learn how to apply energy formulas to physics problems. This is a pivotal stage of secondary physics education; however, a declining interest and motivation for STEM subjects can be observed during that time (<xref ref-type="bibr" rid="B39">Potvin and Hasni, 2014</xref>; <xref ref-type="bibr" rid="B14">Frenzel et al., 2012</xref>).</p>
<p>The work presented in this article was done as part of a project that aims to develop a digital learning environment for training these important quantitative problem-solving skills. In order for that learning environment to be effectively adapted to the individual needs of the students, an automatic assessment of problem solving is essential (e.g., <xref ref-type="bibr" rid="B37">Plass and Pawar, 2020</xref>; <xref ref-type="bibr" rid="B25">Lee et al., 2024</xref>).</p>
<p>In general, a distinction is made between problem solving and routine exercises (e.g., <xref ref-type="bibr" rid="B45">Smith, 1991</xref>). This distinction does not imply that a problem task is necessarily more difficult than a routine task, but it is made based on the necessary cognitive processes. A problem can be defined as a task where a defined beginning state has to be transformed into a desired end state without an immediately apparent solution path (e.g., <xref ref-type="bibr" rid="B27">Martinez, 1998</xref>; <xref ref-type="bibr" rid="B7">Csap&#x00F3; and Funke, 2017</xref>; <xref ref-type="bibr" rid="B10">D&#x00F6;rner and Funke, 2017</xref>). Solving a problem task requires active decision making on which concepts and principles to use (<xref ref-type="bibr" rid="B31">Mosier et al., 2018</xref>; <xref ref-type="bibr" rid="B40">Price et al., 2022</xref>). A routine exercise, in contrast, is a task where the solver knows from the beginning how to get to the solution. For that reason, it depends on the solver, whether a task poses a problem or a routine task for them (<xref ref-type="bibr" rid="B27">Martinez, 1998</xref>).</p>
<p>For decades, various models for problem-solving processes were defined. These models have in common that they divide problem-solving processes into different phases of subprocesses. For example, <xref ref-type="bibr" rid="B38">P&#x00F3;lya (1945)</xref> defined the following four steps of solving a mathematics problem: &#x201C;understand the problem,&#x201D; &#x201C;make a plan,&#x201D; &#x201C;carry out the plan,&#x201D; and &#x201C;look back.&#x201D; <xref ref-type="bibr" rid="B15">Friege (2001)</xref> defined four similar phases, that he called &#x201C;problem representation,&#x201D; &#x201C;development or selection of a problem scheme,&#x201D; &#x201C;elaboration of a solution,&#x201D; and &#x201C;evaluation of the solution.&#x201D; The PISA problem-solving assessment is based on a similar process model as well, although this model uses a slightly different division of the subprocesses. In the PISA model, there are the subprocesses &#x201C;exploring and understanding,&#x201D; &#x201C;representing and formulating,&#x201D; &#x201C;planning and executing,&#x201D; and &#x201C;monitoring and reflecting&#x201D; (<xref ref-type="bibr" rid="B41">Ramalingam et al., 2017</xref>). During the PISA assessment, students need to answer items including multiple choice, drag and drop, and written solution formats using computers (<xref ref-type="bibr" rid="B35">OECD, 2013</xref>).</p>
<p>The PISA problem-solving model has also been used for qualitative analysis of written problem solutions (<xref ref-type="bibr" rid="B22">Kelly et al., 2016</xref>). Such qualitative analyses are very close to the process under examination which enables valid and reliable assessment. They can be used to gather insights into the problem-solving process, e.g., to explore the sequential structure (<xref ref-type="bibr" rid="B48">Tschisgale et al., 2025</xref>) or to compare expert-like and novice-like problem solutions (<xref ref-type="bibr" rid="B8">Docktor et al., 2016</xref>). Rubrics like the Minnesota Assessment of Problem Solving (MAPS) can be used to quantify the results of qualitative analyses in order to make them accessible for comparisons to other assessments (<xref ref-type="bibr" rid="B9">Docktor et al., 2015</xref>).</p>
<p>For deeper insights into the cognitive processes of problem solvers, qualitative analyses of written problem solutions are frequently supplemented with verbal data like interviews or think aloud protocols (e.g., <xref ref-type="bibr" rid="B6">Chiu et al., 2022</xref>). In interview studies, the participants are asked to explain their solution after they wrote it down, whereas participants in think aloud studies are prompted to verbalize their thoughts during the process (<xref ref-type="bibr" rid="B13">Ericsson and Simon, 1993</xref>). These kinds of studies are considered the &#x201C;gold standard&#x201D; of problem-solving process analysis (<xref ref-type="bibr" rid="B8">Docktor et al., 2016</xref>), but they require a lot of effort, so it is difficult to assess large populations using these methodologies.</p>
<p>Another difficulty of qualitative assessments based on problem solutions is that students might produce incomplete data. For example, a student that fails to devise a plan for the problem solution will not get to the execution or evaluation phases. Therefore, the participant&#x2019;s skills in these subprocesses cannot be analyzed using their written solution. <xref ref-type="bibr" rid="B3">Brandenburger (2016)</xref> designed a test instrument that separately assesses the four subprocesses of problem solving. This approach can overcome the abovementioned difficulties of incomplete solutions, because the skills for the subprocesses are assessed independently. However, this approach does not assess the decision making necessary for problem solving, since the participants are prompted step-by-step to execute certain parts of the solution process. Also, the tests were conducted using pen and paper at the university level. There is currently no assessment tool available following this approach, that is applicable for digital learning environments on secondary school level and enables automatic evaluation.</p>
<p>For our project, we aim to develop a digital assessment tool, that separately and independently assesses secondary school students&#x2019; skills for the different subprocesses of problem solving. We call this an &#x201C;atomized&#x201D; approach. The research question for the presented validation study is: <italic>Can the sub-dimensional structure of problem-solving skills be assessed using an atomized test instrument?</italic></p>
</sec>
<sec id="S2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<p>A problem-solving test was designed as part of a digital learning environment about physics problem solving concerning the conservation of energy in secondary school lessons. For validation of this test, two consecutive studies were conducted. The first study served as a preliminary study for item design. It was conducted in 13 classes (years 9 through 11) from three secondary schools (German &#x201C;Gymnasium&#x201D;). In total, 270 students participated, of whom 138 were male, 127 were female, and six had a non-binary gender identity. The mean age was 15.9 (SD 3.2) years. The students answered one of eight test sets with eight items each. A total of 40 different items were examined in this study. The test sets were implemented in a web-based learning environment and answered using computers or tablets during a regular physics lesson. The results were quantitatively analyzed using a Rasch analysis and further examined qualitatively. The results from these analyses led to several changes that are explained in further detail in section &#x201C;2.1 Design of the problem-solving tasks.&#x201D;</p>
<p>The second study was conducted in 24 classes (years 9 through 11) from 12 secondary schools [German &#x201C;Gymnasium&#x201D; and &#x201C;Gesamtschule (KGS)&#x201D;] with a total of 530 students (253 males, 252 females, 11 diverse, 14 NA). The mean age was 16.3 (SD 0.9) years. In this study, the students answered a personal questionnaire, a problem-solving pretest, and an energy pretest. Then, they did a problem-solving training that is unrelated to this validation study. Thereafter, the students answered a problem-solving posttest and an energy posttest. The posttests were answered by 230 students. All of these instruments were implemented in a web-based learning environment and accessed using computers or tablets. A subsample of participants additionally solved a problem task using pen and paper during the lesson of the pretests or the lesson of the posttests. The students decided voluntarily if they wanted to solve this additional task. In total, 51 students (28 females and 23 males; age: 15.6 SD 1.1) participated in this supplemental assessment.</p>
<sec id="S2.SS1">
<label>2.1</label>
<title>Design of the problem-solving tasks</title>
<sec id="S2.SS1.SSS1">
<label>2.1.1</label>
<title>Initial item design based on theoretical considerations</title>
<p>At first, the items for the atomized problem-solving test (APST) were designed. For this purpose, the relevant official curriculum and commonly used school textbooks were examined. The subject area about quantitative energy considerations is taught between year 9 and 11 of Lower Saxony&#x2019;s secondary schools. During this teaching unit, students are supposed to learn the formulas for thermal energy, gravitational potential energy and kinetic energy. Besides understanding the interconnections of these formulas, students begin to engage quantitatively with physics problem solving concerning simple mechanics and thermodynamics (<xref ref-type="bibr" rid="B30">Ministry of School and Culture Lower Saxony, 2015</xref>). Based on these insights, various problem tasks were designed and discussed with colleagues of our institute. In the end, eight problem tasks were determined.</p>
<p>These eight problems were subsequently &#x201C;atomized,&#x201D; meaning that each problem task was divided into four sub-tasks concerning the phases of problem-solving processes. Accordingly, for each problem there are tasks for <italic>Representation</italic>, <italic>Planning</italic>, <italic>Execution</italic>, and <italic>Evaluation</italic>. <xref ref-type="fig" rid="F1">Figure 1</xref> illustrates the resulting structure of the APST. The complete problem solution is shown in the center of the figure and is divided into the four subprocesses of problem solving. The different APST atoms, displayed on either side of the figure, contain the problem situation together with an increasing number of subprocess solutions. For example, the <italic>Representation</italic> atom includes only the problem situation and the <italic>Representation</italic> item, whereas the <italic>Evaluation</italic> atom incorporates the solutions from the <italic>Representation</italic>, <italic>Planning</italic>, and <italic>Execution</italic> subprocesses in addition to the <italic>Evaluation</italic> item.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Schematic figure of the &#x201C;atomization&#x201D; of a problem for the atomized problem-solving test (APST).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="feduc-11-1759878-g001.tif">
<alt-text content-type="machine-generated">Diagram of the Atomized Problem-Solving Test. A central panel labeled &#x201C;Problem and Solution&#x201D; contains four stacked stages: Representation, Planning, Execution, and Evaluation, all under a shared &#x201C;Problem Situation.&#x201D; Four surrounding panels represent the Representation Atom, Planning Atom, Execution Atom, and Evaluation Atom. Each atom begins with &#x201C;Problem Situation&#x201D; and includes selected stages from the central process, with one highlighted item in each: R-Item in the Representation Atom, P-Item in the Planning Atom, DS-Item in the Execution Atom, and E-Item in the Evaluation Atom. Arrows connect the central panel to each atom, indicating the relationship between the overall problem-solving process and its atomized components.</alt-text>
</graphic>
</fig>
<p>For designing these sub-tasks, didactical considerations were balanced against technical constraints. The goal was to design tasks, that reliably evaluate the phases of problem-solving processes, are digitally accessible without requiring special equipment, and can be assessed automatically. Task formats were inspired by the test items of <xref ref-type="bibr" rid="B3">Brandenburger (2016)</xref>, who used a similar approach for assessing problem solving. But, since her test was used for assessment at the university level and it was conducted using pen and paper, major adjustments were necessary to design a digital test for secondary school level. The resulting task formats are summarized in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Item types for the atomized problem-solving test (APST) tested in study 1 and study 2.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Item type</th>
<th valign="top" align="left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" colspan="2"><bold>Study 1</bold></td>
</tr>
<tr>
<td valign="top" align="left">Representation (R)</td>
<td valign="top" align="left">Multiple choice with drawings as options</td>
</tr>
<tr>
<td valign="top" align="left">Planning (P)</td>
<td valign="top" align="left">Always two sub-items<break/> PX_1: multiple choice with written plans as options<break/> PX_2: multiple choice with formulas</td>
</tr>
<tr>
<td valign="top" align="left">Execution (D)</td>
<td valign="top" align="left">Freely written equations with step-by-step instructions</td>
</tr>
<tr>
<td valign="top" align="left">Self-explanation (S)</td>
<td valign="top" align="left">Complete problem solution that the students need to explain in a short text</td>
</tr>
<tr>
<td valign="top" align="left">Evaluation (E)</td>
<td valign="top" align="left">One or two questions about a complete problem solution</td>
</tr>
<tr>
<td valign="top" align="left" colspan="2"><bold>Study 2</bold></td>
</tr>
<tr>
<td valign="top" align="left">Representation (R)</td>
<td valign="top" align="left">Multiple choice with drawings as options</td>
</tr>
<tr>
<td valign="top" align="left">Planning (P)</td>
<td valign="top" align="left">Always two sub-items<break/> PX_1: multiple choice with written plans as options<break/> PX_2: multiple choice with equations as options (only one correct)</td>
</tr>
<tr>
<td valign="top" align="left">Execution (DS)</td>
<td valign="top" align="left">Problem solutions with omissions that the students fill out with equations</td>
</tr>
<tr>
<td valign="top" align="left">Evaluation (E)</td>
<td valign="top" align="left">One or two questions about a complete problem solution</td>
</tr>
</tbody>
</table></table-wrap>
<p>For the <italic>Representation</italic> phase, students had to select the correct drawing out of four options. Each option accurately depicted the problem situation, but the given and searched quantities were manipulated. A visual representation with a drawing is commonly used in physics problem solving. From a technical perspective, it is difficult to automatically assess students&#x2019; drawings. For that reason, the multiple choice (MC) item with different drawings was considered a feasible alternative.</p>
<p>The items for the Planning phase already contained a correct drawing as a visual representation of the problem situation. This was done because the items are supposed to assess precisely one subprocess of problem solving independently from the other subprocesses. By providing a correct drawing as a visual representation, students&#x2019; success in the Planning items is less influenced by their representation skills. The students had to choose an appropriate approach for solving the problem in two MC questions. The options of the first MC task were short texts explaining approaches. In these questions, the students were supposed to identify the physics concepts and interconnections relevant for the problem. The MC format was used for this question, since formulating an approach with its quantitative manipulations without actually performing them is an unfamiliar task for students. It requires a lot of creativity and mental flexibility that average students might not be capable of (<xref ref-type="bibr" rid="B49">Tschisgale et al., 2023</xref>). The second MC question contained various equations and the students should decide which of them are useful for following the selected approach. The Planning items are designed similarly to the test instrument of <xref ref-type="bibr" rid="B3">Brandenburger (2016)</xref>.</p>
<p>The Execution phase was evaluated using two task types. In both task types the students were provided with a correct visual representation and a written plan appropriate for solving the problem. In the first type of execution items (D-items), students should follow step-by-step instructions to perform the mathematical operations for solving the problem. This type of item focused on mathematical competencies with the step-by-step instructions providing the plan for the problem solution. Based on the considerations of <xref ref-type="bibr" rid="B38">P&#x00F3;lya (1945)</xref>, the &#x201C;carry out the plan&#x201D; phase is characterized by a finite set of operations that the problem solver knows to be necessary. Therefore, that phase becomes similar to solving a routine exercise. In the other type of items for this phase (S-items), the students were provided with a complete mathematical solution without any comments and were then prompted to explain the given solution. This kind of explanation was derived from self-explanations (<xref ref-type="bibr" rid="B5">Chi et al., 1989</xref>) that are known to be productive in the context of worked examples (<xref ref-type="bibr" rid="B1">Atkinson et al., 2000</xref>; <xref ref-type="bibr" rid="B11">Dudzinska, 2020</xref>; <xref ref-type="bibr" rid="B18">Hilbert et al., 2008</xref>). The self-explanation items were designed to assess the understanding of a given problem solution.</p>
<p>For the <italic>Evaluation</italic> items, students were provided with a complete problem solution containing a drawing and explaining comments. Below the solution, they were asked questions about the model-like considerations that were used within this solution. For example, the students were asked, how the result would be different under real-world conditions in order to reflect on assumptions that were made for the given solution. As another example, there are questions proposing changes to the problem situation like a hill, that a car runs down, being twice as high. The students were then asked to estimate the new result without calculating, but arguing physically. The students needed to answer these questions in short written texts. The idea of this kind of evaluation task was to assess whether the students are able to identify, understand, and explain certain model-like considerations that are frequently used in this kind of problem solving. This is considered to be the necessary skill for evaluating whether a problem solution is appropriate in a given situation or not.</p>
</sec>
<sec id="S2.SS1.SSS2">
<label>2.1.2</label>
<title>Item revision based on empirical results from the preliminary study</title>
<p>The results of 270 students each answering eight items (two representation tasks, two planning tasks, one D-item, one S-item, and two evaluation tasks) were used for analyses. This was only a preliminary study and there were some issues with the overlap of items in the different test sets. For that reason, the item parameters were simply examined using descriptive statistics like the proportion of correct solutions and a basic one-dimensional Rasch analysis. Additionally, the students&#x2019; answers were inspected qualitatively guided by item parameters like infit values that were obtained from the Rasch analysis (e.g., <xref ref-type="bibr" rid="B32">Neumann, 2014</xref>). As a brief summary, these analyses indicated the following revisions to be appropriate.</p>
<p>The <italic>Representation</italic> items were answered correctly by between 75% and 90% of the participants, so they seem to be relatively easy. However, besides that, they seem to adequately fit the test instrument, because the infit values do not indicate problems for ability estimation using these items. The infits of the items are between 0.77 and 1.16 with a range of 0.8&#x2013;1.2 being acceptable (<xref ref-type="bibr" rid="B2">Bond and Fox, 2007</xref>). As a result, the <italic>Representation</italic> items were not revised fundamentally, but the most difficult items were selected and minor changes were made to the quality of the drawings.</p>
<p>The first part of the <italic>Planning</italic> items, where the students select the written approaches, seems to be adequate as well. The ratios of correct solutions for these subitems are between 20% and 36% and the infits between 0.80 and 1.09. For the second type of subitem, where the students had to choose equations, it was noticeable that the students often only chose one equation even though multiple equations were correct and useful. Even when answers that contained only useful equations, but not necessarily all of them, were rated as correct, the correct answer ratios are between 14% and 40%. The infits are between 0.79 and 1.25. For that reason, the <italic>Planning</italic> items were slightly amended so that in the second MC question, there are now equations that mathematically represent the selected approach as options. In this amended version, there is always exactly one correct option.</p>
<p>The most substantial changes were made for the items assessing the <italic>Execution</italic> phase. The infits of the D-items with step-by-step instructions were relatively low, between 0.69 and 0.92. Also, these items were only answered correctly by between 5% and 29% of the participants, so they seem to be difficult for students. The qualitative analysis while rating the students&#x2019; answers to these items revealed that the students did not follow the instructions closely enough. This led to difficulties for valid rating, because many students solved the problem, but&#x2014;strictly speaking&#x2014;they did not answer the prompts for the separate steps. A similar observation was made for the self-explanations in the S-items. Here, the students rather described the formulas line by line without explaining the physical meanings or plans behind them. Therefore, many student answers were not wrong, but they also did not meet the expectations. As a result, the self-explanation items were excluded completely from the APST. The D-items with the step-by-step instructions were changed to a format similar to an incomplete worked example (e.g., <xref ref-type="bibr" rid="B1">Atkinson et al., 2000</xref>; <xref ref-type="bibr" rid="B18">Hilbert et al., 2008</xref>). In these new DS-items assessing the <italic>Execution</italic> phase, the students are provided with a problem description, a correct drawing of the problem situation, and a commented solution with omitted equations that the students are prompted to add.</p>
<p>The <italic>Evaluation</italic> items appeared to be adequate. With correct answer ratios between 16% and 49% they are rather difficult, but also cover a wide range of difficulties. Infit values between 0.78 and 1.14 indicate no major issues for the Rasch model. The qualitative analysis of students&#x2019; answers revealed a large variety of correct and incorrect evaluations. The incorrect answers in particular revealed quite precisely which students had misconceptions about the conservation of energy and about assumptions that are frequently made for school-like problem solving. Thus, the <italic>Evaluation</italic> items were not changed fundamentally and the best fitting items were used for the further validation.</p>
<p>In addition to the described improvements of existing items regarding the item types, four new problems were designed and &#x201C;atomized.&#x201D; Subsequently, two test sets each containing eight items (two per item type) were designed guided by the item parameters from the Rasch analysis. These test sets were used for the second study. The reworked APST was supplemented by the following assessments for further validation.</p>
</sec>
</sec>
<sec id="S2.SS2">
<label>2.2</label>
<title>Qualitative analysis of written problem solutions</title>
<p>The problem-solving skills of a subsample of participants were also assessed using complete written solutions to a physics energy problem. In the problem task, the students were supposed to determine whether a football can be kicked over a fence. For solving the problem, the students needed to use the principle of energy conservation and the formulas for kinetic energy and gravitational potential energy.</p>
<p>The written problem solutions were analyzed by two coders using the Minnesota Assessment of Problem-Solving (MAPS) rubric (<xref ref-type="bibr" rid="B8">Docktor et al., 2016</xref>). The MAPS rubric has five different categories: <italic>useful description</italic>, <italic>physics approach</italic>, <italic>specific application of physics</italic>, <italic>mathematical procedures</italic>, and <italic>logical progression</italic>. For all of the categories, a score between zero and five is assigned to the solution. The scores are ordinally scaled with five being the best score indicating expert-like skills regarding the category. Additionally, the coders rated every student&#x2019;s solution as either solved correctly or not solved (correctly) leading to an additional score of zero or one for the problem solution. The two coders discussed cases with differing scores and defined a consensus rating.</p>
<p>For this article, the quantitative correlations between MAPS scores and APST results are analyzed. Further details on problem design and the qualitative analyses of the written solutions are reported in <xref ref-type="bibr" rid="B29">Meyer et al. (2025b)</xref>.</p>
</sec>
<sec id="S2.SS3">
<label>2.3</label>
<title>Energy test</title>
<p>Conceptual knowledge is known to influence the problem-solving skills for domain-specific problems (e.g., <xref ref-type="bibr" rid="B16">Friege and Lind, 2006</xref>). Since the presented assessment tool focuses on quantitative energy problems, the conceptual knowledge about energy was assessed. For this purpose, we used an energy test that has been used in multiple projects of our institution before (e.g., <xref ref-type="bibr" rid="B11">Dudzinska, 2020</xref>). It consists of 20 multiple choice items based on the energy concept assessment (ECA) (<xref ref-type="bibr" rid="B33">Neumann et al., 2013</xref>). The items cover four conceptions of energy: <italic>forms</italic>, <italic>transformation</italic>, <italic>conservation</italic>, and <italic>degradation</italic>. These conceptions are known to form a learning progression for energy in secondary physics education (<xref ref-type="bibr" rid="B12">Duit, 2014</xref>; <xref ref-type="bibr" rid="B33">Neumann et al., 2013</xref>). The participants of this study are supposed to have qualitatively learned the <italic>forms</italic> and <italic>transformation</italic> conceptions, and are in the process of learning quantitative aspects of the energy concept like <italic>conservation</italic> and <italic>degradation</italic>.</p>
<p>For this study, we chose five items for each of the four conceptions, ranging from relatively easy to relatively difficult items based on Rasch analyses that were conducted by <xref ref-type="bibr" rid="B33">Neumann et al. (2013)</xref> and within our institution. In total, the energy test items are validated using answers from more than 2,000 students.</p>
</sec>
<sec id="S2.SS4">
<label>2.4</label>
<title>Personal questionnaire</title>
<p>Because the studies were conducted completely anonymously, the students answered a questionnaire to provide us with personal information. In the questionnaire, the students were asked about their age, which gender they identify with, and whether German was their mother tongue. Additionally, they were asked about their school career: what type of school they attended, which year they were in, and what their last grades in physics, mathematics, and German were.</p>
</sec>
<sec id="S2.SS5">
<label>2.5</label>
<title>Data analysis</title>
<p>Different procedures were used for the quantitative analysis of the APST items. First, data screening was done using descriptive statistics. Subsequently, a multidimensional item response theory (MIRT) analysis was applied using the R-library &#x201C;mirt&#x201D; (<xref ref-type="bibr" rid="B4">Chalmers, 2012</xref>). Item response theory (IRT) is a class of various statistical models that can be used to estimate the probability of a specific response pattern based on a latent trait of the item and the person answering it (<xref ref-type="bibr" rid="B2">Bond and Fox, 2007</xref>). In the case of educational assessment, IRT is mostly used to determine the likelihood of a student answering an item correctly based on their skill (latent trait of the person) and the item&#x2019;s difficulty (latent trait of the item) (<xref ref-type="bibr" rid="B44">Rost, 2004</xref>). An advantage of IRT compared to classical test theory is that the estimated item parameters are independent of the population they were based on (<xref ref-type="bibr" rid="B2">Bond and Fox, 2007</xref>). For that reason, an IRT-validated assessment tool can be utilized to analyze person abilities in various populations. In the case of MIRT, the item parameters and person abilities are estimated on multiple dimensions.</p>
<p>A <italic>common-item equating to a calibrated pool design</italic> was used for the MIRT analyses, meaning that the pretest and posttest results from study two were combined in order to place all items on a common scale (<xref ref-type="bibr" rid="B23">Kolen and Brennan, 2014</xref>, pp. 215&#x2013;219). For that purpose, a first model was calculated using only the common-items that were part of the pretest and the posttest. The item parameters from this model were analyzed using differential item functioning (DIF) to examine if there are significant differences between the calculated item parameters in the pretest group and the posttest group. If there are significant differences for an item, this item is not suitable as common-item, because it is unstable. Afterwards, all of the items are used in a grouped model with the item parameters of the stable common-items fixed (fixed parameter calibration) (<xref ref-type="bibr" rid="B23">Kolen and Brennan, 2014</xref>, pp. 182&#x2013;183).</p>
<p>Since the basic dimensionality of the instrument was defined by the four phases of problem-solving during the item design, no exploratory factor analysis (EFA) was conducted before the MIRT. A four-dimensional model following the types of items with pairwise covariances was assumed and a 2-parameter logistic (2PL) model (<xref ref-type="bibr" rid="B17">Hambleton and Swaminathan, 1985</xref>) was calculated using the quasi-Monte Carlo expectation-maximization (QMCEM) algorithm. In a 2PL model, two item parameters are estimated: item difficulty and discrimination. The MIRT model syntax is accessible in the <xref ref-type="supplementary-material" rid="DS1">Supplementary Datasheet 1</xref>.</p>
<p>After MIRT modeling, a confirmatory factor analysis (CFA) was performed using the &#x201C;lavaan&#x201D; package (<xref ref-type="bibr" rid="B43">Rosseel, 2012</xref>) in R to examine the factor structure in further detail. The exact model selection can be an important aspect of an instrument&#x2019;s empirical validation (<xref ref-type="bibr" rid="B20">Immekus et al., 2019</xref>). Three different model structures were tested: correlated factors, higher-order, and bifactor. As a commonality, these model structures assume that the four dimensions of the instrument are distinct, yet related. In a correlated factors model, the factors are simply correlated. In a higher-order model, there is one predominant factor (e.g., problem solving) and the four distinct factors (e.g., representation, planning, execution, and evaluation) are subdimensions of this overriding factor. Following this structure, every item would measure one of the four mentioned subprocesses and together these subprocesses are combined to the overall problem-solving process. In a bifactor model, every item loads on one of the four distinct dimensions and additionally on one primary factor. This can be interpreted as every item simultaneously measuring one of the subprocesses and problem solving in general.</p>
<p>For quantitative validation of the presented instrument, the person ability scores from the MIRT analysis were utilized. Person ability scores are a metric scale of a person&#x2019;s ability for each of the four dimensions. Since the MIRT analysis does not provide an ability score for problem solving in general, the sum of correct items in the APST was used. It was then analyzed whether the four subdimension skills or the total APST scores correlate with the conceptual knowledge about energy, the school grades, and MAPS scores. Correlations were calculated using the &#x201C;psych&#x201D; package (<xref ref-type="bibr" rid="B42">Revelle, 2007</xref>) in R.</p>
</sec>
</sec>
<sec id="S3" sec-type="results">
<label>3</label>
<title>Results</title>
<p>The proportion of students that answered an item correctly and the distribution of total scores in the APST were used as descriptive statistics. The total scores in the pretest are normally distributed with a mean score of 5.9 (SD 2.9) out of 12 correct items per student. The ratios of correct solutions per item are widely spread between 19% (item DS8) and 81% (item R5). Usually, it is recommended to aim for a ratio between 20% and 80% when designing an assessment tool (<xref ref-type="bibr" rid="B44">Rost, 2004</xref>). Items with correct answer ratios below 20% might be too difficult and items that are answered correctly by more than 80% of participants are potentially too easy.</p>
<p>The test sets contained four common-items: R5 for the <italic>Representation</italic> phase, P4 (P4_1+P4_2) for the <italic>Planning</italic> phase, DS7 (DS7_1+DS7_2) for the <italic>Execution</italic> phase, and E6 for the <italic>Evaluation</italic> phase. These items were used for a MIRT model using grouped pretest and posttest data. The DIF analysis revealed significant differences between the item parameters in the pre-test group and the post-test group for item DS7 (DS7_1: <italic>p</italic> &#x003C; 0.01; DS7_2: <italic>p</italic> &#x003C; 0.05), meaning that the item parameters for item DS7 could not be estimated reliably. For that reason, DS7 cannot be considered a stable common item. For the other items, no significant differences for the item parameters were detected (<italic>p</italic> &#x003E; 0.05). As a result, the items R5, P4, and E6 were fixed as stable common-items for fixed parameter calibration.</p>
<p>Subsequently, a MIRT model including all items was estimated. Infit statistics were used to guide a qualitative analysis of the items and students&#x2019; answers like in the preliminary study 1. Most items showed acceptable fit; however, item P4 did not conform to the model. Both subitems of P4 exhibited infit values of approximately 0.4.</p>
<p>In MIRT analyses, infit values between 0.5 and 1.5 are generally considered productive for measurement, whereas values below 0.5 indicate limited contribution to the assessment and values above 2.0 are regarded as degrading (<xref ref-type="bibr" rid="B26">Linacre, 2002</xref>). The low infit values observed for items P4_1 and P4_2 therefore suggest that the items contribute little information to the model.</p>
<p>A qualitative analysis of the corresponding problem task supports this interpretation. The task underlying P4 required students to calculate the energy needed by a crane to lift a weight. In its atomized form, the solution plan consisted solely of computing the gravitational potential energy of the load and adding a given amount of degraded energy. This presents a routine exercise rather than a problem task for the target population. Consequently, the items P4_1 and P4_2 did not adequately align with the intended construct of problem-solving subprocesses and were therefore excluded from the final MIRT model.</p>
<p>The item DS7_1 exhibited an even lower infit value of 0.1, indicating very limited contribution to the assessment. However, the qualitative analysis of this item did not reveal an obvious substantive reason for its lack of productivity. Nevertheless, re-estimating the model without DS7_1 resulted in a substantial improvement of the global fit indices. This finding indicates that DS7_1 adversely affected parameter estimation. On this basis, it was classified as a harmful item for measurement and excluded from the APST.</p>
<p>The final MIRT model was calculated using grouped pre-test and post-test data and the common-items R5 and E6. This model converged normally within 0.0001 tolerance after 373 QMCEM iterations. The model fit parameters (log-likelihood = &#x2212;3576.6; AIC = 7241.1; BIC = 7445.0) were the lowest compared to various alternative models that were estimated during the analyses and thus this model is the best fit for the data. Comparing different models in order to analyze the dimensionality is a commonly used procedure in IRT studies (e.g., <xref ref-type="bibr" rid="B51">Wu and Adams, 2006</xref>; <xref ref-type="bibr" rid="B52">Z&#x00F6;ttl et al., 2011</xref>). It is noteworthy that these fit parameters can only be utilized for comparing different model specifications, but they are not useful for absolute argumentations.</p>
<sec id="S3.SS1">
<label>3.1</label>
<title>Factor structure</title>
<p>The MIRT model was estimated using a four-dimensional model structure based on the four phases of problem-solving that inspired the item design. Analyses of pairwise covariances between all of the four factors were enabled. The factor loadings of the items to the postulated subdimensions are between 0.47 and 0.92 which can be interpreted as moderate to high factor loadings. The proportional variances of the four subdimensions are between 9% for the <italic>Representation</italic> and 17% for the <italic>Execution</italic> with a total of 55% of variance being explained. The factor correlations are mostly moderate between 0.20 (R&#x223C;P) and 0.89 (D&#x223C;E). In summary, the MIRT analyses supported a four-dimensional structure. For further analyses, confirmatory factor analyses (CFA) were executed as explained in the methods section.</p>
<p>The CFA for the bifactor model did not converge and was therefore found inappropriate for the test structure. The CFA of the correlated factors model [&#x03C7;<sup>2</sup>(98) = 104.8; <italic>p</italic> &#x003E; 0.3; CFI = 0.99; TLI = 0.99; RMSEA = 0.01] and the higher-order model [&#x03C7;<sup>2</sup>(100) = 106.7; <italic>p</italic> &#x003E; 0.3; CFI = 0.99; TLI = 0.99; RMSEA = 0.01] converged with good global fit indices (<xref ref-type="bibr" rid="B19">Hu and Bentler, 1999</xref>). A Chi-Squared-difference test showed no significant difference in the global fit of these two models [&#x0394;&#x03C7;<sup>2</sup>(2) = 2.84; <italic>p</italic> &#x003E; 0.2] and the AIC and BIC parameters are nearly identical. So, in regard to the global fit, the correlated factors model and the higher-order model are equally suitable for the APST subdimensions.</p>
<p>As for the local fit, the higher-order model shows no significant factor loading from the <italic>Evaluation</italic> factor to the higher-order factor <italic>Problem Solving</italic> (&#x03B2; = 0.02; <italic>p</italic> &#x003E; 0.9). This indicates that the four subdimensions of the APST do not seem to load on a single-dimensional higher-order factor <italic>Problem Solving</italic>. The <italic>Representation</italic>, <italic>Planning</italic>, and <italic>Execution</italic> items can be summarized by one higher-order factor. However, the <italic>Evaluation</italic> variable cannot be explained by the same higher-order factor. Additionally, the correlation between the higher-order factor <italic>Problem Solving</italic> and the subdimension <italic>Evaluation</italic> is very high (&#x03C1; = 0.99; <italic>p</italic> &#x003C; 0.001) and the variance of the <italic>Evaluation</italic> subdimension without <italic>Problem Solving</italic> is less than 0.02. This indicates that these two factors seem to assess nearly the same construct and that <italic>Evaluation</italic> skills cannot be separated from the higher-order factor <italic>Problem Solving</italic> in the APST.</p>
<p>In the correlated factors model, all of the APST dimensions show significant covariances. In particular, the covariance between <italic>Execution</italic> and <italic>Evaluation</italic> is very strong (&#x03C1; = 0.8; <italic>p</italic> &#x003C; 0.001). The covariances of <italic>Representation</italic> and <italic>Planning</italic> as well as of <italic>Planning</italic> and <italic>Execution</italic> are weak (0.1 &#x003C; &#x03C1; &#x003C; 0.3; <italic>p</italic> &#x003C; 0.05). The other covariances are moderate (0.3 &#x003C; &#x03C1; &#x003C; 0.5; <italic>p</italic> &#x003C; 0.01).</p>
</sec>
<sec id="S3.SS2">
<label>3.2</label>
<title>Item parameters</title>
<p>As described above, the infit values for every item were used as a guide for qualitative analyses and led to the exclusion of the items P4_1, P4_2, and DS7_1. In the final MIRT model, most of the remaining items showed infit values between 0.5 and 1.0 which is well within the accepted range of 0.5&#x2013;1.5 (<xref ref-type="bibr" rid="B26">Linacre, 2002</xref>). Only the <italic>Planning</italic> item P10_1 is slightly out of this range with an infit of 0.46. This item might be mildly unproductive, but lower infit values do not indicate degrading items. Since the qualitative analysis of item P10 did not reveal any misfitting content, it was left in the test set. The MIRT analysis generated difficulty and discrimination parameters for each item (see <xref ref-type="table" rid="T2">Table 2</xref>). The discrimination parameters are divided into the four dimensions of the APST, so every type of item discriminates on a different scale. The discrimination values vary between 0.90 and 3.11. The <italic>Representation</italic> and the <italic>Execution</italic> items (DS-items) show more homogeneous discrimination parameters between 1.4 and 2.0, whereas the <italic>Evaluation</italic> items show the greatest variety. The <italic>Planning</italic> items show overall relatively high discriminations between 1.93 and 3.07. This is a considerable variation in the item discrimination.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Item parameters of the multidimensional item response theory (MIRT) analysis: item discriminations for the four subdimensions and item difficulties.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Item</th>
<th valign="top" align="left">Representation</th>
<th valign="top" align="left">Planning</th>
<th valign="top" align="left">Execution</th>
<th valign="top" align="left">Evaluation</th>
<th valign="top" align="left">Difficulty</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">R1</td>
<td valign="top" align="left">1.71</td>
<td valign="top" align="left" rowspan="3"/>
<td valign="top" align="left" rowspan="7"/>
<td valign="top" align="left" rowspan="11"/>
<td valign="top" align="left">&#x2212;1.24</td>
</tr>
<tr>
<td valign="top" align="left">R5</td>
<td valign="top" align="left">1.74</td>
<td valign="top" align="left">&#x2212;1.50</td>
</tr>
<tr>
<td valign="top" align="left">R9</td>
<td valign="top" align="left">1.6</td>
<td valign="top" align="left">&#x2212;1.09</td>
</tr>
<tr>
<td valign="top" align="left">P3_1</td>
<td valign="top" align="left" rowspan="13"/>
<td valign="top" align="left">2.55</td>
<td valign="top" align="left">0.01</td>
</tr>
<tr>
<td valign="top" align="left">P3_2</td>
<td valign="top" align="left">1.93</td>
<td valign="top" align="left">&#x2212;0.26</td>
</tr>
<tr>
<td valign="top" align="left">P10_1</td>
<td valign="top" align="left">3.07</td>
<td valign="top" align="left">0.7</td>
</tr>
<tr>
<td valign="top" align="left">P10_2</td>
<td valign="top" align="left">2.41</td>
<td valign="top" align="left">0.71</td>
</tr>
<tr>
<td valign="top" align="left">DS7_2</td>
<td valign="top" align="left" rowspan="9"/>
<td valign="top" align="left">1.44</td>
<td valign="top" align="left">&#x2212;0.08</td>
</tr>
<tr>
<td valign="top" align="left">DS8</td>
<td valign="top" align="left">1.35</td>
<td valign="top" align="left">1.44</td>
</tr>
<tr>
<td valign="top" align="left">DS11_1</td>
<td valign="top" align="left">1.87</td>
<td valign="top" align="left">&#x2212;0.76</td>
</tr>
<tr>
<td valign="top" align="left">DS11_2</td>
<td valign="top" align="left">2.10</td>
<td valign="top" align="left">&#x2212;0.21</td>
</tr>
<tr>
<td valign="top" align="left">E2_1</td>
<td valign="top" align="left" rowspan="5"/>
<td valign="top" align="left">3.11</td>
<td valign="top" align="left">0.12</td>
</tr>
<tr>
<td valign="top" align="left">E2_2</td>
<td valign="top" align="left">0.99</td>
<td valign="top" align="left">0.33</td>
</tr>
<tr>
<td valign="top" align="left">E6</td>
<td valign="top" align="left">0.90</td>
<td valign="top" align="left">&#x2212;0.83</td>
</tr>
<tr>
<td valign="top" align="left">E12_1</td>
<td valign="top" align="left">1.83</td>
<td valign="top" align="left">1.11</td>
</tr>
<tr>
<td valign="top" align="left">E12_2</td>
<td valign="top" align="left">1.46</td>
<td valign="top" align="left">&#x2212;0.37</td>
</tr>
</tbody>
</table></table-wrap>
<p>Item difficulty in IRT analyses is a dimensionless, metric scale that allows the difficulty of items within the same test to be compared. Like most of the other parameters, the difficulty parameters cannot be compared across different IRT models. The APST item difficulties vary in a range from &#x2212;1.50 to 1.44. The <italic>Representation</italic> items are on the easy end of that scale with difficulties between &#x2212;1.50 and &#x2212;1.09. The other item types show reasonable variation with some items being on the easier half of the scale and some items on the more difficult half of the scale. <italic>Evaluation</italic> items are the most difficult ones. The detailed results can be found in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
</sec>
<sec id="S3.SS3">
<label>3.3</label>
<title>Person parameters and correlation analyses</title>
<p>The MIRT model also estimated person ability scores for each participant for each of the four subdimensions of the APST. Since it was not possible to implement the higher-order structure into the MIRT analysis, the overall problem-solving skill can only be estimated using the number of correct items like in classical test theory. Correlations of the estimated person ability scores with the different covariates that were assessed during study two were analyzed.</p>
<p>The MAPS rubric was used as a second problem-solving assessment with a subsample of 51 participants. Unfortunately, 23 of these participants did not answer the APST appropriately, so their results could not be used for analyses. For that reason, only 28 results were used for this correlation analysis, so the results (see <xref ref-type="table" rid="T3">Table 3</xref>) are rather exploratory. Especially, the <italic>Evaluation</italic> and <italic>Execution</italic> items show significant, moderate to strong correlations with the MAPS scores.</p>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Spearman correlation matrix for &#x201C;atomized&#x201D; problem-solving test (APST) person abilities and Minnesota Assessment of Problem Solving (MAPS) scores.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><italic>n</italic> = 28</th>
<th valign="top" align="center">Solution</th>
<th valign="top" align="center">Description</th>
<th valign="top" align="center">Approach</th>
<th valign="top" align="center">Physics</th>
<th valign="top" align="center">Mathematics</th>
<th valign="top" align="center">Logic</th>
<th valign="top" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Representation</td>
<td valign="top" align="center">&#x2212;0.08</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">&#x2212;0.11</td>
<td valign="top" align="center">&#x2212;0.17</td>
<td valign="top" align="center">0.03</td>
<td valign="top" align="center">&#x2212;0.19</td>
<td valign="top" align="center">&#x2212;0.16</td>
</tr>
<tr>
<td valign="top" align="center">Planning</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.12</td>
<td valign="top" align="center">0.18</td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.14</td>
<td valign="top" align="center">0.21</td>
</tr>
<tr>
<td valign="top" align="center">Execution</td>
<td valign="top" align="center">0.43<xref ref-type="table-fn" rid="t3fns1">&#x002A;</xref></td>
<td valign="top" align="center">0.17</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">0.16</td>
<td valign="top" align="center">0.44<xref ref-type="table-fn" rid="t3fns1">&#x002A;</xref></td>
<td valign="top" align="center">0.19</td>
<td valign="top" align="center">0.31</td>
</tr>
<tr>
<td valign="top" align="center">Evaluation</td>
<td valign="top" align="center">0.57<xref ref-type="table-fn" rid="t3fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.25</td>
<td valign="top" align="center">0.39<xref ref-type="table-fn" rid="t3fns1">&#x002A;</xref></td>
<td valign="top" align="center">0.16</td>
<td valign="top" align="center">0.44<xref ref-type="table-fn" rid="t3fns1">&#x002A;</xref></td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.38<xref ref-type="table-fn" rid="t3fns1">&#x002A;</xref></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="t3fns1"><p>&#x002A;: <italic>p</italic> &#x003C; 0.05;</p></fn>
<fn id="t3fns2"><p>&#x002A;&#x002A;: <italic>p</italic> &#x003C; 0.01.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>For the analyses of correlations between the energy test and APST as well as the information from the personal questionnaire and the APST, the sample size was <italic>n</italic> = 530. This makes the results (see <xref ref-type="table" rid="T4">Table 4</xref>) more reliable and all of the correlations are highly significant (<italic>p</italic> &#x003C; 0.01). All of the APST dimensions are moderately correlated with the total score of the energy test and the scores for the <italic>Conservation</italic> items from the energy test.</p>
<table-wrap position="float" id="T4">
<label>TABLE 4</label>
<caption><p>Spearman correlation matrix for &#x201C;atomized&#x201D; problem-solving test (APST) person abilities and energy test scores.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><italic>n</italic> = 530</th>
<th valign="top" align="center">Forms</th>
<th valign="top" align="center">Transformation</th>
<th valign="top" align="center">Conservation</th>
<th valign="top" align="center">Degradation</th>
<th valign="top" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Representation</td>
<td valign="top" align="center">0.30<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.33<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.44<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.27<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.47<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
</tr>
<tr>
<td valign="top" align="center">Planning</td>
<td valign="top" align="center">0.26<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.27<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.32<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.22<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.37<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
</tr>
<tr>
<td valign="top" align="center">Execution</td>
<td valign="top" align="center">0.25<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.25<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.39<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.26<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.41<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
</tr>
<tr>
<td valign="top" align="center">Evaluation</td>
<td valign="top" align="center">0.31<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.33<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.43<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.30<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
<td valign="top" align="center">0.49<xref ref-type="table-fn" rid="t4fns2">&#x002A;&#x002A;</xref></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="t4fns2"><p>&#x002A;&#x002A;: <italic>p</italic> &#x003C; 0.01.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>The correlations between the APST dimensions and the reported school grades for mathematics and physics are weak to moderate and negative. In Germany, school grades range from 1 to 6, with 1 being the best possible grade. The negative sign therefore indicates a positive relationship: participants with better grades also score higher in the APST. The native language of the participants as well as their gender are essentially uncorrelated with the APST results (&#x03C1; &#x003C; 0.1).</p>
</sec>
</sec>
<sec id="S4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>In this study, items for a digital and atomized problem-solving assessment, the APST, were developed. The final set of items, including sample solutions and estimated item parameters, is provided in the <xref ref-type="supplementary-material" rid="DS2">Supplementary Datasheet 2</xref>. The item design is grounded in theoretical models of problem-solving processes and operationalizes four subprocesses that recur across multiple frameworks (e.g., <xref ref-type="bibr" rid="B38">P&#x00F3;lya, 1945</xref>; <xref ref-type="bibr" rid="B15">Friege, 2001</xref>; <xref ref-type="bibr" rid="B41">Ramalingam et al., 2017</xref>).</p>
<p>With respect to physics content, the APST focuses on energy conservation in alignment with the relevant secondary-school curriculum (<xref ref-type="bibr" rid="B30">Ministry of School and Culture Lower Saxony, 2015</xref>). As participants in the present study were not necessarily instructed in quantitative problem solving across other physics domains, like forces, the instrument was deliberately restricted to this content area. In addition, the APST builds on established design principles of an existing test instrument for problem-solving skills (<xref ref-type="bibr" rid="B3">Brandenburger, 2016</xref>). Together, these arguments support the content validity of the APST.</p>
<p>To examine criterion validity, conceptual knowledge about energy and problem-solving skills were assessed using validated test instruments. In addition, school grades in mathematics and physics were collected as indicators of general skills concerning these subjects. Problem solving is known to be influenced by conceptual knowledge and the ability to apply this knowledge to unfamiliar situations (<xref ref-type="bibr" rid="B50">Tuminaro and Redish, 2007</xref>; <xref ref-type="bibr" rid="B36">OECD, 2023</xref>). Accordingly, conceptual knowledge can be expected to predict problem-solving performance (<xref ref-type="bibr" rid="B16">Friege and Lind, 2006</xref>).</p>
<p>The analyses revealed significant correlations between all APST subdimensions and all conceptions assessed in the energy test. In particular, the <italic>Conservation</italic> conception showed stronger associations with APST results than other conceptions of energy. This finding is theoretically plausible, as the APST was explicitly designed for a learning environment that emphasizes the quantitative application of the energy conservation principle.</p>
<p>Moreover, the <italic>Representation</italic> and <italic>Evaluation</italic> subdimensions show stronger correlations with conceptual knowledge than <italic>Planning</italic> and <italic>Execution</italic>. In the <italic>Representation</italic> phase, students are required to identify relevant physics concepts and principles within a problem context, which presupposes well-developed conceptual understanding. Similarly, evaluating a complete solution necessitates situating a model-like solution within broader physical considerations, again drawing on conceptual knowledge. In contrast, the <italic>Planning</italic> and <italic>Execution</italic> phases rely more strongly on procedural skills and problem schemes (<xref ref-type="bibr" rid="B15">Friege, 2001</xref>). <italic>Planning</italic> additionally requires abstract thinking and imagination, as students must anticipate a solution path without yet performing it (<xref ref-type="bibr" rid="B49">Tschisgale et al., 2023</xref>), whereas <italic>Execution</italic> is characterized by the mathematical processing of planned steps (<xref ref-type="bibr" rid="B38">P&#x00F3;lya, 1945</xref>). It is therefore expected that these phases depend less strongly on conceptual knowledge than <italic>Representation</italic> and <italic>Evaluation</italic>.</p>
<p>The MAPS rubric (<xref ref-type="bibr" rid="B8">Docktor et al., 2016</xref>) was employed as an external measure of problem-solving skills to explore its relationship with APST scores. However, the corresponding correlation analyses are substantially limited by the small number of participants available due to artifacts in the data collection process. Within these limitations, exploratory analyses indicate that MAPS scores, primarily the mathematics rubric and the problem-solving success, show tentative associations with the <italic>Execution</italic> and <italic>Evaluation</italic> subdimensions of the APST. In contrast, no statistically significant associations were observed for the <italic>Planning</italic> and <italic>Representation</italic> subdimensions.</p>
<p>For the <italic>Representation</italic> subdimension, a negative trend can be observed for the association with MAPS scores. This finding should be interpreted with particular caution. One possible explanation is that the R-items of the APST are comparatively easy and therefore answered correctly by a large proportion of participants, which may restrict variance. Alternatively, this trend may reflect differences in the assessment approaches of APST and MAPS rather than substantive differences in representation skills. In the MAPS assessment, students solve problems without explicit guidance in order to minimize interference with the problem-solving process (<xref ref-type="bibr" rid="B8">Docktor et al., 2016</xref>). As a result, students are not required to explicitly perform the representation phase. In cases where no drawing or other representation is produced but the solution process is otherwise correct, the highest MAPS score is still assigned. Consequently, high MAPS scores may occur even when representation skills are not directly assessed. In contrast, the APST explicitly requires all participants to engage in the representation phase as a core component of the assessment.</p>
<p>Given the limited statistical power of the analyses, the MAPS-based findings should be interpreted as preliminary rather than confirmatory. Further research is needed to adequately examine the associations between MAPS and APST.</p>
<p>School grades in mathematics and physics are correlated significantly with the APST scores, indicating that students with stronger overall skills tend to perform better in the APST. In contrast, no significant correlations were observed with grades in German, native language, or gender. The absence of these correlations suggests that no systematic language-related or gender-related bias was observed in the APST results.</p>
<p>The MIRT analyses provided support for the four-dimensional structure assumed in the test design. Model fit is acceptable, since the algorithms converged within the usual QMCEM tolerance of 0.0001 and the fit parameters are within acceptable ranges. Moderate to high factor loadings among items within each subdimension indicate that the items consistently assess their intended constructs. Together with the correlations between the four subdimensions, these findings support the assumption that the problem-solving subprocesses require distinct yet interrelated skills.</p>
<p>The CFA results further support a correlated factors structure of the APST items, indicating empirically related yet distinguishable dimensions. Although a higher-order factor can be statistically specified, its interpretation requires caution. In particular, the higher-order factor shows substantial redundancy with the <italic>Evaluation</italic> subdimension as indicated by the near-unity loading of <italic>Evaluation</italic> on the higher-order factor, the extremely high correlation between those two factors, and the resulting minimal residual variance. These findings suggest that the APST subdimensions do not constitute a general problem-solving construct.</p>
<p>From a conceptual perspective, general problem solving inherently involves decision making, such as selecting strategies, representations, and solutions (<xref ref-type="bibr" rid="B31">Mosier et al., 2018</xref>; <xref ref-type="bibr" rid="B40">Price et al., 2022</xref>). By atomizing the solution process and guiding participants step-by-step, the APST excludes such decision making from the assessment. Accordingly, the results from the higher-order factor model suggest that problem solving is more than the sum of its subprocesses.</p>
<p>On the other hand, these findings also highlight the special role of <italic>Evaluation</italic> within the problem-solving process. Evaluating a given solution requires understanding the problem situation, comprehending the underlying plan, and judging the appropriateness of the execution, in addition to evaluation-specific competencies. This interpretation is further supported by the observed correlations between the APST subdimensions, with <italic>Evaluation</italic> showing moderate associations with <italic>Representation</italic> (<italic>r</italic> = 0.31) and <italic>Planning</italic> (<italic>r</italic> = 0.36), and a strong association with <italic>Execution</italic> (<italic>r</italic> = 0.66). Given the focus on quantitative problem solving, the strong link between <italic>Execution</italic> and <italic>Evaluation</italic> is theoretically plausible, as procedural and mathematical skills play a central role in assessing solution correctness.</p>
<p>Taken together, these findings underscore the conceptual distinction between assessing subprocess skills and measuring problem-solving competence in a broader sense. While the APST provides detailed information about students&#x2019; proficiency in specific subprocess skills, the present results should not be interpreted as evidence for assessment of a general problem-solving factor. Further research is required to examine how the subprocess skills, particularly <italic>Evaluation</italic>, relate to general problem-solving literacy.</p>
<p>The item parameters of the MIRT analysis indicate that the APST items span a broad range of difficulty levels. The <italic>Representation</italic> items are an exception: they are consistently easy and exhibit ceiling effects, with up to 81% correct responses. As a consequence, these items show limited sensitivity to individual differences in representation skills and contribute less to discrimination at higher ability levels. Understanding and representing the problem situation is often a comparatively straightforward subprocess that expert-like problem solvers tend to perform implicitly rather than explicitly (<xref ref-type="bibr" rid="B15">Friege, 2001</xref>). In addition, the APST in its current form does not require participants to independently construct a representation, as the multiple-choice format already provides response options. Both aspects likely reduce task difficulty and restrict variance in item responses.</p>
<p>Despite these limitations, the <italic>Representation</italic> items were retained in the current instrument because they operationalize a theoretically central phase of the problem-solving process and ensure coverage of all four subprocesses assumed in the underlying framework, while allowing for automated assessment. At the same time, the observed ceiling effects indicate that the present items are not optimal for assessing higher levels of representation skills. More complex problem situations or alternative task formats, such as prompting students to generate their own representations, may be necessary to increase sensitivity. Future technical developments may enable automatic assessment of such alternative task formats, for example through artificial intelligence (AI)-based analyses of student-generated drawings (<xref ref-type="bibr" rid="B24">Lee and Zhai, 2025</xref>).</p>
<p>In conclusion, the presented study provides evidence that the APST can assess four theoretically grounded subprocesses of problem solving as described in established models (<xref ref-type="bibr" rid="B38">P&#x00F3;lya, 1945</xref>; <xref ref-type="bibr" rid="B15">Friege, 2001</xref>). By focusing on energy conservation problems, the APST is particularly suited for use in secondary physics education, where quantitative problem-solving skills are still developing. However, to what extent the assessed subprocess skills predict more general problem-solving literacy remains an open question, and the generalizability of the instrument to other physics concepts requires further investigation.</p>
<p>A key strength of the APST in its current form is its suitability for automatic assessment across all item types, e.g., using open-source AI for automatic feedback on the <italic>Evaluation</italic> items (<xref ref-type="bibr" rid="B28">Meyer et al., 2025a</xref>). This enables large-scale summative assessments as well as formative assessments in everyday school teaching. However, similar to the test instrument proposed by <xref ref-type="bibr" rid="B3">Brandenburger (2016)</xref>, the APST does not require active decision making. Consequently, while it may inform students about their skills for different subprocesses of problem solving, the APST is not sufficient on its own to assess general problem-solving literacy. A combined use of atomized instruments such as the APST and assessments based on complete problem solutions, ideally supplemented by verbal data, may therefore offer a more comprehensive approach to evaluating problem-solving processes.</p>
</sec>
</body>
<back>
<sec id="S5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="S6" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Regionales Landesamt f&#x00FC;r Schule und Bildung (RLSB) Hannover [Approval-Nr. H1R.10-81402-(103/2024) and H1R.10-81402-(116/2023)]. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation in this study was provided by the participants&#x2019; legal guardians/next of kin.</p>
</sec>
<sec id="S7" sec-type="author-contributions">
<title>Author contributions</title>
<p>AM: Resources, Validation, Conceptualization, Writing &#x2013; review &#x0026; editing, Data curation, Methodology, Writing &#x2013; original draft, Formal analysis, Software, Funding acquisition, Visualization, Investigation, Project administration. GF: Supervision, Methodology, Writing &#x2013; review &#x0026; editing, Conceptualization, Resources, Project administration, Validation, Visualization.</p>
</sec>
<sec id="S9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="S10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. ChatGPT and DeepL were used as generative AI tools for language editing of the submitted article in order to improve readability and clarity. The final text was checked for factual accuracy.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="S11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="S12" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/feduc.2026.1759878/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/feduc.2026.1759878/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="DS1" mimetype="application/pdf"/>
<supplementary-material xlink:href="Data_Sheet_2.pdf" id="DS2" mimetype="application/pdf"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Atkinson</surname> <given-names>R. K.</given-names></name> <name><surname>Derry</surname> <given-names>S. J.</given-names></name> <name><surname>Renkl</surname> <given-names>A.</given-names></name> <name><surname>Wortham</surname> <given-names>D.</given-names></name></person-group> (<year>2000</year>). <article-title>Learning from examples: Instructional principles from the worked examples research.</article-title> <source><italic>Rev. Educ. Res.</italic></source> <volume>70</volume>:<fpage>181</fpage>. <pub-id pub-id-type="doi">10.2307/1170661</pub-id></mixed-citation></ref>
<ref id="B2"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bond</surname> <given-names>T. G.</given-names></name> <name><surname>Fox</surname> <given-names>C. M.</given-names></name></person-group> (<year>2007</year>). <source><italic>Applying the Rasch Model: Fundamental Measurement in the Human Sciences.</italic></source> <publisher-loc>Mahwah, NJ</publisher-loc>: <publisher-name>Lawrence Erlbaum Associates</publisher-name>.</mixed-citation></ref>
<ref id="B3"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Brandenburger</surname> <given-names>M.</given-names></name></person-group> (<year>2016</year>). <source><italic>Was Beeinflusst den Erfolg Beim Probleml&#x00F6;sen in der Physik? Eine Untersuchung mit Studierenden. [What influences success in problem-solving in physics? A study with students].</italic></source> <publisher-loc>Berlin</publisher-loc>: <publisher-name>Logos Verlag Berlin GmbH</publisher-name>. German</mixed-citation></ref>
<ref id="B4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chalmers</surname> <given-names>R. P.</given-names></name></person-group> (<year>2012</year>). <article-title>mirt: A multidimensional item response theory package for the R environment.</article-title> <source><italic>J. Stat. Soft.</italic></source> <volume>48</volume> <fpage>1</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v048.i06</pub-id></mixed-citation></ref>
<ref id="B5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chi</surname> <given-names>M. T.</given-names></name> <name><surname>Bassok</surname> <given-names>M.</given-names></name> <name><surname>Lewis</surname> <given-names>M. W.</given-names></name> <name><surname>Reimann</surname> <given-names>P.</given-names></name> <name><surname>Glaser</surname> <given-names>R.</given-names></name></person-group> (<year>1989</year>). <article-title>Self-Explanations: How students study and use examples in learning to solve problems.</article-title> <source><italic>Cogn. Sci.</italic></source> <volume>13</volume> <fpage>145</fpage>&#x2013;<lpage>182</lpage>. <pub-id pub-id-type="doi">10.1207/s15516709cog1302_1</pub-id></mixed-citation></ref>
<ref id="B6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chiu</surname> <given-names>B.</given-names></name> <name><surname>Randles</surname> <given-names>C.</given-names></name> <name><surname>Irby</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Analyzing student problem-solving with MAtCH.</article-title> <source><italic>Front. Educ.</italic></source> <volume>6</volume>:<fpage>769042</fpage>. <pub-id pub-id-type="doi">10.3389/feduc.2021.769042</pub-id></mixed-citation></ref>
<ref id="B7"><mixed-citation publication-type="book"><person-group person-group-type="editor"><name><surname>Csap&#x00F3;</surname> <given-names>B.</given-names></name> <name><surname>Funke</surname> <given-names>J.</given-names></name></person-group> <role>eds</role> (<year>2017</year>). <source><italic>The Nature of Problem Solving.</italic></source> <publisher-loc>Paris</publisher-loc>: <publisher-name>OECD Publishing</publisher-name>.</mixed-citation></ref>
<ref id="B8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Docktor</surname> <given-names>J. L.</given-names></name> <name><surname>Dornfeld</surname> <given-names>J.</given-names></name> <name><surname>Frodermann</surname> <given-names>E.</given-names></name> <name><surname>Heller</surname> <given-names>K.</given-names></name> <name><surname>Hsu</surname> <given-names>L.</given-names></name> <name><surname>Jackson</surname> <given-names>K. A.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Assessing student written problem solutions: A problem-solving rubric with application to introductory physics.</article-title> <source><italic>Phys. Rev. Phys. Educ. Res.</italic></source> <volume>12</volume>:<fpage>010130</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevPhysEducRes.12.010130</pub-id></mixed-citation></ref>
<ref id="B9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Docktor</surname> <given-names>J. L.</given-names></name> <name><surname>Strand</surname> <given-names>N. E.</given-names></name> <name><surname>Mestre</surname> <given-names>J. P.</given-names></name> <name><surname>Ross</surname> <given-names>B. H.</given-names></name></person-group> (<year>2015</year>). <article-title>Conceptual problem solving in high school physics.</article-title> <source><italic>Phys. Rev. ST Phys. Educ. Res.</italic></source> <volume>11</volume>:<fpage>020106</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevSTPER.11.020106</pub-id></mixed-citation></ref>
<ref id="B10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>D&#x00F6;rner</surname> <given-names>D.</given-names></name> <name><surname>Funke</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Complex problem solving: What it is and what it is not.</article-title> <source><italic>Front. Psychol.</italic></source> <volume>8</volume>:<fpage>1153</fpage>. <pub-id pub-id-type="doi">10.3389/fpsyg.2017.01153</pub-id> <pub-id pub-id-type="pmid">28744242</pub-id></mixed-citation></ref>
<ref id="B11"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Dudzinska</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <source><italic>Lernen mit Beispielaufgaben und Feedback im Physikunterricht der Sekundarstufe 1: Energieerhaltung zur L&#x00F6;sung von Aufgaben nutzen. [Learning with example problems and feedback in secondary school physics lessons: Using conservation of energy to solve problems].</italic></source> <publisher-loc>Berlin</publisher-loc>: <publisher-name>LOGOS Verlag</publisher-name>. German</mixed-citation></ref>
<ref id="B12"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Duit</surname> <given-names>R.</given-names></name></person-group> (<year>2014</year>). &#x201C;<article-title>Teaching and learning the physics energy concept</article-title>,&#x201D; in <source><italic>Teaching and Learning of Energy in K &#x2013; 12 Education</italic></source>, <role>eds</role> <person-group person-group-type="editor"><name><surname>Chen</surname> <given-names>R. F.</given-names></name> <name><surname>Eisenkraft</surname> <given-names>A.</given-names></name> <name><surname>Fortus</surname> <given-names>D.</given-names></name> <name><surname>Krajcik</surname> <given-names>J.</given-names></name> <name><surname>Neumann</surname> <given-names>K.</given-names></name> <name><surname>Nordine</surname> <given-names>J.</given-names></name><etal/></person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>67</fpage>&#x2013;<lpage>85</lpage>.</mixed-citation></ref>
<ref id="B13"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ericsson</surname> <given-names>K. A.</given-names></name> <name><surname>Simon</surname> <given-names>H. A.</given-names></name></person-group> (<year>1993</year>). <source><italic>Protocol Analysis.</italic></source> <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>The MIT Press</publisher-name>.</mixed-citation></ref>
<ref id="B14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Frenzel</surname> <given-names>A. C.</given-names></name> <name><surname>Pekrun</surname> <given-names>R.</given-names></name> <name><surname>Dicke</surname> <given-names>A.-L.</given-names></name> <name><surname>Goetz</surname> <given-names>T.</given-names></name></person-group> (<year>2012</year>). <article-title>Beyond quantitative decline: Conceptual shifts in adolescents&#x2019; development of interest in mathematics.</article-title> <source><italic>Dev. Psychol.</italic></source> <volume>48</volume> <fpage>1069</fpage>&#x2013;<lpage>1082</lpage>. <pub-id pub-id-type="doi">10.1037/a0026895</pub-id> <pub-id pub-id-type="pmid">22288365</pub-id></mixed-citation></ref>
<ref id="B15"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Friege</surname> <given-names>G.</given-names></name></person-group> (<year>2001</year>). <source><italic>Wissen und Probleml&#x00F6;sen: Eine empirische Untersuchung des wissenszentrierten Probleml&#x00F6;sens im Gebiet der Elektrizit&#x00E4;tslehre auf der Grundlage des Experten-Novizen-Vergleichs. [Knowledge and problem solving: An empirical investigation of knowledge-centered problem solving in the field of electricity based on expert-novice comparison.].</italic></source> <publisher-loc>Berlin</publisher-loc>: <publisher-name>LOGOS Verlag</publisher-name>. German</mixed-citation></ref>
<ref id="B16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Friege</surname> <given-names>G.</given-names></name> <name><surname>Lind</surname> <given-names>G.</given-names></name></person-group> (<year>2006</year>). <article-title>Types and qualities of knowledge and their relations to problem solving in physics.</article-title> <source><italic>Int. J. Sci. Math. Educ.</italic></source> <volume>4</volume> <fpage>437</fpage>&#x2013;<lpage>465</lpage>. <pub-id pub-id-type="doi">10.1007/s10763-005-9013-8</pub-id></mixed-citation></ref>
<ref id="B17"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hambleton</surname> <given-names>R. K.</given-names></name> <name><surname>Swaminathan</surname> <given-names>H.</given-names></name></person-group> (<year>1985</year>). <source><italic>Item Response Theory.</italic></source> <publisher-loc>Dordrecht</publisher-loc>: <publisher-name>Springer Netherlands</publisher-name>.</mixed-citation></ref>
<ref id="B18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hilbert</surname> <given-names>T. S.</given-names></name> <name><surname>Renkl</surname> <given-names>A.</given-names></name> <name><surname>Schworm</surname> <given-names>S.</given-names></name> <name><surname>Kessler</surname> <given-names>S.</given-names></name> <name><surname>Reiss</surname> <given-names>K.</given-names></name></person-group> (<year>2008</year>). <article-title>Learning to teach with worked-out examples: A computer-based learning environment for teachers.</article-title> <source><italic>J. Comp. Assis. Learn.</italic></source> <volume>24</volume> <fpage>316</fpage>&#x2013;<lpage>332</lpage>. <pub-id pub-id-type="doi">10.1111/j.1365-2729.2007.00266.x</pub-id></mixed-citation></ref>
<ref id="B19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>L.</given-names></name> <name><surname>Bentler</surname> <given-names>P. M.</given-names></name></person-group> (<year>1999</year>). <article-title>Cutoff criteria for fit indexes in covariance structure analysis: Conventional criteria versus new alternatives.</article-title> <source><italic>Struct. Equat. Model. A Multidisc. J.</italic></source> <volume>6</volume> <fpage>1</fpage>&#x2013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.1080/10705519909540118</pub-id></mixed-citation></ref>
<ref id="B20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Immekus</surname> <given-names>J. C.</given-names></name> <name><surname>Snyder</surname> <given-names>K. E.</given-names></name> <name><surname>Ralston</surname> <given-names>P. A.</given-names></name></person-group> (<year>2019</year>). <article-title>Multidimensional item response theory for factor structure assessment in educational psychology research.</article-title> <source><italic>Front. Educ.</italic></source> <volume>4</volume>:<fpage>45</fpage>. <pub-id pub-id-type="doi">10.3389/feduc.2019.00045</pub-id></mixed-citation></ref>
<ref id="B21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jang</surname> <given-names>H.</given-names></name></person-group> (<year>2016</year>). <article-title>Identifying 21st Century STEM competencies using workplace data.</article-title> <source><italic>J. Sci. Educ. Technol.</italic></source> <volume>25</volume> <fpage>284</fpage>&#x2013;<lpage>301</lpage>. <pub-id pub-id-type="doi">10.1007/s10956-015-9593-1</pub-id></mixed-citation></ref>
<ref id="B22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kelly</surname> <given-names>R.</given-names></name> <name><surname>McLoughlin</surname> <given-names>E.</given-names></name> <name><surname>Finlayson</surname> <given-names>O. E.</given-names></name></person-group> (<year>2016</year>). <article-title>Analysing student written solutions to investigate if problem-solving processes are evident throughout.</article-title> <source><italic>Intern. J. Sci. Educ.</italic></source> <volume>38</volume> <fpage>1766</fpage>&#x2013;<lpage>1784</lpage>. <pub-id pub-id-type="doi">10.1080/09500693.2016.1214766</pub-id></mixed-citation></ref>
<ref id="B23"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kolen</surname> <given-names>M. J.</given-names></name> <name><surname>Brennan</surname> <given-names>R. L.</given-names></name></person-group> (<year>2014</year>). <source><italic>Test Equating, Scaling, and Linking.</italic></source> <publisher-loc>New York, NY</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="B24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>G.</given-names></name> <name><surname>Zhai</surname> <given-names>X.</given-names></name></person-group> (<year>2025</year>). <article-title>Realizing visual question answering for education: GPT-4V as a multimodal AI.</article-title> <source><italic>TechTrends</italic></source> <volume>69</volume> <fpage>271</fpage>&#x2013;<lpage>287</lpage>. <pub-id pub-id-type="doi">10.1007/s11528-024-01035-z</pub-id></mixed-citation></ref>
<ref id="B25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>G.-G.</given-names></name> <name><surname>Latif</surname> <given-names>E.</given-names></name> <name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>N.</given-names></name> <name><surname>Zhai</surname> <given-names>X.</given-names></name></person-group> (<year>2024</year>). <article-title>Applying large language models and chain-of-thought for automatic scoring.</article-title> <source><italic>Comp. Educ. Art. Intell.</italic></source> <volume>6</volume>:<fpage>100213</fpage>. <pub-id pub-id-type="doi">10.1016/j.caeai.2024.100213</pub-id></mixed-citation></ref>
<ref id="B26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Linacre</surname> <given-names>J. M.</given-names></name></person-group> (<year>2002</year>). <article-title>What do infit and outfit, mean-square and standardized mean?</article-title> <source><italic>Rasch Measurement Trans.</italic></source> <volume>16</volume>:<fpage>878</fpage>.</mixed-citation></ref>
<ref id="B27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Martinez</surname> <given-names>M. E.</given-names></name></person-group> (<year>1998</year>). <article-title>What is problem solving?</article-title> <source><italic>Phi Delta Kappan</italic></source> <volume>79</volume> <fpage>605</fpage>&#x2013;<lpage>609</lpage>.</mixed-citation></ref>
<ref id="B28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meyer</surname> <given-names>A.</given-names></name> <name><surname>Bleckmann</surname> <given-names>T.</given-names></name> <name><surname>Friege</surname> <given-names>G.</given-names></name></person-group> (<year>2025a</year>). <article-title>Automatic feedback on physics tasks using open-source generative artificial intelligence.</article-title> <source><italic>Intern. J. Sci. Educ.</italic></source> <fpage>1</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1080/09500693.2025.2499220</pub-id> <comment>[Epub ahead of print]</comment>.</mixed-citation></ref>
<ref id="B29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meyer</surname> <given-names>A.</given-names></name> <name><surname>Fischer</surname> <given-names>S.</given-names></name> <name><surname>Friege</surname> <given-names>G.</given-names></name></person-group> (<year>2025b</year>). <article-title>Analyzing problem-solving in secondary physics education: A rubric-guided approach to explore individual learning needs.</article-title> <source><italic>Phys. Rev. Phys. Educ. Res.</italic></source> <volume>22</volume>:<fpage>010111</fpage>. <pub-id pub-id-type="doi">10.1103/3bs7-fnrd</pub-id></mixed-citation></ref>
<ref id="B30"><mixed-citation publication-type="book"><collab>Ministry of School and Culture Lower Saxony</collab> (<year>2015</year>). <source><italic>Kerncurriculum f&#x00FC;r das Gymnasium Schuljahrg&#x00E4;nge 5-10 Naturwissenschaften. [Core curriculum for secondary schools, grades 5-10, natural sciences].</italic></source> <publisher-loc>Hannover</publisher-loc>: <publisher-name>Ministry of School and Culture Lower Saxony</publisher-name>. German</mixed-citation></ref>
<ref id="B31"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Mosier</surname> <given-names>K.</given-names></name> <name><surname>Fischer</surname> <given-names>U.</given-names></name> <name><surname>Hoffman</surname> <given-names>R. R.</given-names></name> <name><surname>Klein</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). &#x201C;<article-title>Expert professional judgments and &#x2018;naturalistic decision making&#x2019;</article-title>,&#x201D; in <source><italic>The Cambridge Handbook of Expertise and Expert Performance</italic></source>, <role>eds</role> <person-group person-group-type="editor"><name><surname>Ericsson</surname> <given-names>K. A.</given-names></name> <name><surname>Hoffman</surname> <given-names>R. R.</given-names></name> <name><surname>Kozbelt</surname> <given-names>A.</given-names></name> <name><surname>Williams</surname> <given-names>A. M.</given-names></name></person-group> (<publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>), <fpage>453</fpage>&#x2013;<lpage>475</lpage>.</mixed-citation></ref>
<ref id="B32"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Neumann</surname> <given-names>K.</given-names></name></person-group> (<year>2014</year>). &#x201C;<article-title>Rasch-Analyse naturwissenschaftsbezogener Leistungstests</article-title>,&#x201D; in <source><italic>Methoden in der Naturwissenschaftsdidaktischen Forschung</italic></source>, <role>eds</role> <person-group person-group-type="editor"><name><surname>Kr&#x00FC;ger</surname> <given-names>D.</given-names></name> <name><surname>Parchmann</surname> <given-names>I.</given-names></name> <name><surname>Schecker</surname> <given-names>H.</given-names></name></person-group> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>355</fpage>&#x2013;<lpage>369</lpage>.</mixed-citation></ref>
<ref id="B33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Neumann</surname> <given-names>K.</given-names></name> <name><surname>Viering</surname> <given-names>T.</given-names></name> <name><surname>Boone</surname> <given-names>W. J.</given-names></name> <name><surname>Fischer</surname> <given-names>H. E.</given-names></name></person-group> (<year>2013</year>). <article-title>Towards a learning progression of energy.</article-title> <source><italic>J. Res. Sci. Teach.</italic></source> <volume>50</volume> <fpage>162</fpage>&#x2013;<lpage>188</lpage>. <pub-id pub-id-type="doi">10.1002/tea.21061</pub-id></mixed-citation></ref>
<ref id="B34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nilsen</surname> <given-names>T.</given-names></name> <name><surname>Angell</surname> <given-names>C.</given-names></name> <name><surname>Gr&#x00F8;nmo</surname> <given-names>L. S.</given-names></name></person-group> (<year>2013</year>). <article-title>Mathematical competencies and the role of mathematics in physics education: A trend analysis of TIMSS Advanced 1995 and 2008.</article-title> <source><italic>ADNO</italic></source> <volume>7</volume> <fpage>1</fpage>&#x2013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.5617/adno.1113</pub-id></mixed-citation></ref>
<ref id="B35"><mixed-citation publication-type="book"><collab>OECD</collab> (<year>2013</year>). <source><italic>PISA 2012 Assessment and Analytical Framework.</italic></source> <publisher-loc>Paris</publisher-loc>: <publisher-name>OECD Publishing</publisher-name>.</mixed-citation></ref>
<ref id="B36"><mixed-citation publication-type="book"><collab>OECD</collab> (<year>2023</year>). <source><italic>PISA 2022 Results.</italic></source> <publisher-loc>Paris</publisher-loc>: <publisher-name>OECD Publishing</publisher-name>.</mixed-citation></ref>
<ref id="B37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Plass</surname> <given-names>J. L.</given-names></name> <name><surname>Pawar</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>Toward a taxonomy of adaptivity for learning.</article-title> <source><italic>J. Res. Technol. Educ.</italic></source> <volume>52</volume> <fpage>275</fpage>&#x2013;<lpage>300</lpage>. <pub-id pub-id-type="doi">10.1080/15391523.2020.1719943</pub-id></mixed-citation></ref>
<ref id="B38"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>P&#x00F3;lya</surname> <given-names>G.</given-names></name></person-group> (<year>1945</year>). <source><italic>How to Solve it.</italic></source> <publisher-loc>Princeton, NJ</publisher-loc>: <publisher-name>Princeton University Press</publisher-name>.</mixed-citation></ref>
<ref id="B39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Potvin</surname> <given-names>P.</given-names></name> <name><surname>Hasni</surname> <given-names>A.</given-names></name></person-group> (<year>2014</year>). <article-title>Interest, motivation and attitude towards science and technology at K-12 levels: A systematic review of 12 years of educational research.</article-title> <source><italic>Stud. Sci. Educ.</italic></source> <volume>50</volume> <fpage>85</fpage>&#x2013;<lpage>129</lpage>. <pub-id pub-id-type="doi">10.1080/03057267.2014.881626</pub-id></mixed-citation></ref>
<ref id="B40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Price</surname> <given-names>A.</given-names></name> <name><surname>Salehi</surname> <given-names>S.</given-names></name> <name><surname>Burkholder</surname> <given-names>E.</given-names></name> <name><surname>Kim</surname> <given-names>C.</given-names></name> <name><surname>Isava</surname> <given-names>V.</given-names></name> <name><surname>Flynn</surname> <given-names>M.</given-names></name><etal/></person-group> (<year>2022</year>). <article-title>An accurate and practical method for assessing science and engineering problem-solving expertise.</article-title> <source><italic>Intern. J. Sci. Educ.</italic></source> <volume>44</volume> <fpage>2061</fpage>&#x2013;<lpage>2084</lpage>. <pub-id pub-id-type="doi">10.1080/09500693.2022.2111668</pub-id></mixed-citation></ref>
<ref id="B41"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ramalingam</surname> <given-names>D.</given-names></name> <name><surname>Philpot</surname> <given-names>R.</given-names></name> <name><surname>McCrae</surname> <given-names>B.</given-names></name></person-group> (<year>2017</year>). &#x201C;<article-title>The PISA 2012 assessment of problem solving</article-title>,&#x201D; in <source><italic>The Nature of Problem Solving</italic></source>, <role>eds</role> <person-group person-group-type="editor"><name><surname>Csap&#x00F3;</surname> <given-names>B.</given-names></name> <name><surname>Funke</surname> <given-names>J.</given-names></name></person-group> (<publisher-loc>Paris</publisher-loc>: <publisher-name>OECD Publishing</publisher-name>), <fpage>75</fpage>&#x2013;<lpage>91</lpage>.</mixed-citation></ref>
<ref id="B42"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Revelle</surname> <given-names>W.</given-names></name></person-group> (<year>2007</year>). <source><italic>psych: Procedures for Psychological, Psychometric, and Personality Research. R package version 2.5.6</italic>.</source> <publisher-loc>Northwestern University, Evanston, IL</publisher-loc>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=psych">https://CRAN.R-project.org/package=psych</ext-link></mixed-citation></ref>
<ref id="B43"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rosseel</surname> <given-names>Y.</given-names></name></person-group> (<year>2012</year>). <article-title>lavaan: An R package for structural equation modeling.</article-title> <source><italic>J. Stat. Soft.</italic></source> <volume>48</volume> <fpage>1</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v048.i02</pub-id></mixed-citation></ref>
<ref id="B44"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rost</surname> <given-names>J.</given-names></name></person-group> (<year>2004</year>). <source><italic>Lehrbuch Testtheorie - Testkonstruktion. [Textbook Test Theory - Test Construction].</italic></source> <publisher-loc>Bern</publisher-loc>: <publisher-name>Verlag Hans Huber</publisher-name>. German</mixed-citation></ref>
<ref id="B45"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Smith</surname> <given-names>M. U.</given-names></name></person-group> (<year>1991</year>). <source><italic>Toward a Unified Theory of Problem Solving: Views from the Content Domains.</italic></source> <publisher-loc>Hillsdale, NJ</publisher-loc>: <publisher-name>L. Erlbaum Associates</publisher-name>.</mixed-citation></ref>
<ref id="B46"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tong</surname> <given-names>T.</given-names></name> <name><surname>Pi</surname> <given-names>F.</given-names></name> <name><surname>Zheng</surname> <given-names>S.</given-names></name> <name><surname>Zhong</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>X.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>Exploring the effect of mathematics skills on student performance in physics problem-solving: A structural equation modeling analysis.</article-title> <source><italic>Res. Sci. Educ.</italic></source> <volume>55</volume> <fpage>489</fpage>&#x2013;<lpage>509</lpage>. <pub-id pub-id-type="doi">10.1007/s11165-024-10201-5</pub-id></mixed-citation></ref>
<ref id="B47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tr&#x00E4;ff</surname> <given-names>U.</given-names></name> <name><surname>Olsson</surname> <given-names>L.</given-names></name> <name><surname>Skagerlund</surname> <given-names>K.</given-names></name> <name><surname>Skagenholt</surname> <given-names>M.</given-names></name> <name><surname>&#x00D6;stergren</surname> <given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>Logical reasoning, spatial processing, and verbal working memory: Longitudinal predictors of physics achievement at Age 12-13 Years.</article-title> <source><italic>Front. Psychol.</italic></source> <volume>10</volume>:<fpage>1929</fpage>. <pub-id pub-id-type="doi">10.3389/fpsyg.2019.01929</pub-id> <pub-id pub-id-type="pmid">31496982</pub-id></mixed-citation></ref>
<ref id="B48"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tschisgale</surname> <given-names>P.</given-names></name> <name><surname>Kubsch</surname> <given-names>M.</given-names></name> <name><surname>Wulff</surname> <given-names>P.</given-names></name> <name><surname>Petersen</surname> <given-names>S.</given-names></name> <name><surname>Neumann</surname> <given-names>K.</given-names></name></person-group> (<year>2025</year>). <article-title>Exploring the sequential structure of students&#x2019; physics problem-solving approaches using process mining and sequence analysis.</article-title> <source><italic>Phys. Rev. Phys. Educ. Res.</italic></source> <volume>21</volume>:<fpage>010111</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevPhysEducRes.21.010111</pub-id></mixed-citation></ref>
<ref id="B49"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tschisgale</surname> <given-names>P.</given-names></name> <name><surname>Wulff</surname> <given-names>P.</given-names></name> <name><surname>Kubsch</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>Integrating artificial intelligence-based methods into qualitative research in physics education research: A case for computational grounded theory.</article-title> <source><italic>Phys. Rev. Phys. Educ. Res.</italic></source> <volume>19</volume>:<fpage>020123</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevPhysEducRes.19.020123</pub-id></mixed-citation></ref>
<ref id="B50"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tuminaro</surname> <given-names>J.</given-names></name> <name><surname>Redish</surname> <given-names>E. F.</given-names></name></person-group> (<year>2007</year>). <article-title>Elements of a cognitive model of physics problem solving: Epistemic games.</article-title> <source><italic>Phys. Rev. ST Phys. Educ. Res.</italic></source> <volume>3</volume>:<fpage>020101</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevSTPER.3.020101</pub-id></mixed-citation></ref>
<ref id="B51"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>M.</given-names></name> <name><surname>Adams</surname> <given-names>R.</given-names></name></person-group> (<year>2006</year>). <article-title>Modelling mathematics problem solving item responses using a multidimensional IRT model.</article-title> <source><italic>Math. Ed. Res. J.</italic></source> <volume>18</volume> <fpage>93</fpage>&#x2013;<lpage>113</lpage>. <pub-id pub-id-type="doi">10.1007/BF03217438</pub-id></mixed-citation></ref>
<ref id="B52"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Z&#x00F6;ttl</surname> <given-names>L.</given-names></name> <name><surname>Ufer</surname> <given-names>S.</given-names></name> <name><surname>Reiss</surname> <given-names>K.</given-names></name></person-group> (<year>2011</year>). &#x201C;<article-title>Assessing modelling competencies using a multidimensional IRT approach</article-title>,&#x201D; in <source><italic>Trends in Teaching and Learning of Mathematical Modelling</italic></source>, <role>eds</role> <person-group person-group-type="editor"><name><surname>Kaiser</surname> <given-names>G.</given-names></name> <name><surname>Blum</surname> <given-names>W.</given-names></name> <name><surname>Borromeo Ferri</surname> <given-names>R.</given-names></name> <name><surname>Stillman</surname> <given-names>G.</given-names></name></person-group> (<publisher-loc>Dordrecht</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>427</fpage>&#x2013;<lpage>437</lpage>.</mixed-citation></ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2896023/overview">Konstantinos T. Kotsis</ext-link>, University of Ioannina, Greece</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3323857/overview">Salman Rashid</ext-link>, Yogyakarta State University, Indonesia</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3334867/overview">Selma Riyasni</ext-link>, Padang State University, Indonesia</p></fn>
</fn-group>
</back>
</article>