<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Educ.</journal-id>
<journal-title>Frontiers in Education</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Educ.</abbrev-journal-title>
<issn pub-type="epub">2504-284X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/feduc.2025.1501796</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Education</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>At-risk student identification and interventions for data science programs at a South African university</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Smit</surname>
<given-names>Neill</given-names>
</name>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2852561/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Osler</surname>
<given-names>Zonia</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/3216308/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>van der Merwe</surname>
<given-names>Leandra</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/3134897/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff><institution>Centre for Business Mathematics and Informatics &#x0026; Unit for Data Science and Computing, North-West University</institution>, <addr-line>Potchefstroom</addr-line>, <country>South Africa</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1203054/overview">Mona Hmoud AlSheikh</ext-link>, Imam Abdulrahman Bin Faisal University, Saudi Arabia</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2423045/overview">Najah Al-Shanableh</ext-link>, Al al-Bayt University, Jordan</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2638593/overview">Mahdi-Reza Borna</ext-link>, Tarbiat Modares University, Iran</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Neill Smit, <email>Neill.Smit@nwu.ac.za</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>02</day>
<month>10</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>10</volume>
<elocation-id>1501796</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>09</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>09</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Smit, Osler and van der Merwe.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Smit, Osler and van der Merwe</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>In this paper, thresholds are established to identify at-risk data science students at a South African university and an intervention process is proposed for handling identified at-risk students. An evaluation of student performance in the core program modules is conducted, focusing on the differences between the standard and extended data science programs offered by this university. Through this evaluation, mark thresholds are specified for core mathematics and statistics modules that can be used to detect at-risk students. A statistical analysis is conducted to determine the suitability of using the thresholds for identifying at-risk students. A fitted logistic regression model, using the number of threshold breaches as the predictor, yields significant predictor coefficients and odds ratios for both programs (<italic>p</italic>&#x202F;=&#x202F;0.0014 and OR&#x202F;=&#x202F;4.0367 for the standard program; <italic>p</italic>&#x202F;=&#x202F;0.0405 and OR&#x202F;=&#x202F;2.1174 for the extended program). For both programs, the Mann&#x2013;Whitney test confirms a statistically significant difference in the number of threshold breaches between graduates and dropouts (<italic>p</italic> &#x003C;&#x202F;0.0001; <italic>p</italic> =&#x202F;0.0273) and Fisher&#x2019;s exact test indicates an association between the number of breaches and dropout status (<italic>p</italic> =&#x202F;0.0002; <italic>p</italic> =&#x202F;0.0312). Lastly, sensitivity/specificity analysis using the number of breaches to classify students yields estimated AUC values of 0.7811 and 0.7074, respectively. An intervention process is also suggested for the data science programs to provide struggling students with advice throughout their academic life cycles. This study shows how a simple threshold approach can be used to design an understandable and program-specific at-risk identification strategy. 
Literature on extended programs is less common than literature on bridging programs, and the differences between these transition programs are also highlighted in this paper.</p>
</abstract>
<kwd-group>
<kwd>at-risk students</kwd>
<kwd>data science education</kwd>
<kwd>extended programs</kwd>
<kwd>intervention process</kwd>
<kwd>program evaluation</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="10"/>
<equation-count count="1"/>
<ref-count count="42"/>
<page-count count="15"/>
<word-count count="12139"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Higher Education</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<sec id="sec2">
<label>1.1</label>
<title>Data science education</title>
<p>Data science is one of the fastest-growing career sectors in the world. The United States Bureau of Labor Statistics projects an employment growth rate for data scientists in the United States of 35% from 2022 to 2032, compared to an average employment growth rate over all occupations of 3% for the same period (<xref ref-type="bibr" rid="ref39">United States Bureau of Labor Statistics, 2023</xref>). LinkedIn lists data scientists, as well as three other jobs related to data science, namely financial technology engineer, machine learning specialist, and big data specialist, under its list of the top ten fastest growing jobs for 2022 (<xref ref-type="bibr" rid="ref23">LinkedIn, 2022</xref>). Glassdoor placed data scientist as the third best job in its list of 50 best jobs in the United States for 2022, where several other data science related jobs also made this list (<xref ref-type="bibr" rid="ref16">Glassdoor, 2023</xref>). Due to the high demand for data scientists, the demand for education and training in data science has skyrocketed, with more and more universities offering data science programs (see, for example, <xref ref-type="bibr" rid="ref13">De Veaux et al., 2017</xref>; <xref ref-type="bibr" rid="ref41">Voulo et al., 2024</xref>).</p>
<p>A host of literature exists on the design of data science programs and the development of this field at university level. Only some of the relevant literature is discussed here. The early data science education research focuses on the technical skills required by data scientists, although the multidisciplinary nature of data science is also considered. <xref ref-type="bibr" rid="ref10">Cleveland (2001)</xref> discusses an action plan to expand six technical areas of statistics and data science for a university department, where the author provides guidelines for resource allocation in data science degrees.</p>
<p>In later research, there is a clear shift towards the growing incorporation of computers in data science education, as well as the integration of the fields of mathematics, statistics, and computer science as the building blocks for data science education. Challenges in statistics education, as well as new teaching innovations and the reform of statistics education, are considered in <xref ref-type="bibr" rid="ref38">Tishkovskaya and Lancaster (2012)</xref>. The authors recommend the incorporation of information technology and real-world practical problems in statistics education, as well as the use of web-based learning materials to supplement teaching. A comprehensive overview of the development of data science is given in <xref ref-type="bibr" rid="ref8">Cao (2017)</xref>. The author discusses various topics, such as the evolution of data science, major challenges and innovations in data science, data competency and education, industrialization and new career opportunities, and the future of data science. <xref ref-type="bibr" rid="ref13">De Veaux et al. (2017)</xref> provide curriculum guidelines for undergraduate data science programs based on inputs from various mathematics, statistics, and computer science university departments across the United States. The integration of courses in these three fields, together with a capstone project, is seen as crucial in data science degrees.</p>
<p>The most recent research emphasizes the need for practical experience and applications to real-world problems as part of data science education. Furthermore, the needs and involvement of government and industry must be considered during the design of data science programs at universities. <xref ref-type="bibr" rid="ref42">Zakari (2020)</xref> addresses the development of statistics and data science at university levels in Niger, focusing on the importance of collaboration with government and industry, as well as collaboration between university departments that need statistics courses to develop academics and researchers with the necessary skills. The design and development of a four-year undergraduate data science program at a university in Bhutan, with inputs from external stakeholders in terms of their needs, is discussed in <xref ref-type="bibr" rid="ref28">Namgay et al. (2022)</xref>. The authors focus not only on the course content and the multidisciplinary nature thereof, but also on the process followed to develop the program and obtain approval for its implementation from the university board. <xref ref-type="bibr" rid="ref14">De Veaux et al. (2022)</xref> expand on the work of <xref ref-type="bibr" rid="ref13">De Veaux et al. (2017)</xref>, by emphasizing some key aspects of data science education that may be overlooked in many data science degrees. The authors recommend that education in data science should focus not only on the theoretical background and the straightforward application to data, but also on training future data scientists to solve real-world problems. Special attention should be given to defining the purpose for which the data were collected, assessing the quality and integrity of the data, thinking about ethical considerations, and effectively communicating the key findings that address the original problem (<xref ref-type="bibr" rid="ref14">De Veaux et al., 2022</xref>).</p>
<p>Due to the mathematical and statistical nature of most data science programs, the admission requirements are typically very high, and few students are allowed into these programs. Keeping in mind the global need for data science graduates, some universities offer bridging and extended data science programs to provide an opportunity for admission to these programs for more students.</p>
</sec>
<sec id="sec3">
<label>1.2</label>
<title>Bridging programs in higher education</title>
<p>The National Plan for Higher Education (NPHE) emphasizes that raising participation rates in higher education relies on enhancing the system&#x2019;s efficiency by increasing the number of graduates it produces (<xref ref-type="bibr" rid="ref11">Council on Higher Education, 2014</xref>). To adhere to the national plan, bridging programs for prospective university students act as an intervention to increase the graduation rates and the performance of students entering universities. Typically for science, technology, engineering, and mathematics (STEM) degrees, the transition from school to university can be challenging, and bridging programs can ease this transition by introducing prospective students to some core concepts before the start of the academic year. These bridging programs also increase the number of students enrolled for certain degrees. Not only is the increase in participation adhering to the national plan, but it is also of great relevance for STEM professions, as there are major skills shortages in these professions and students often do not meet the admission requirements for STEM degrees. In this case, bridging programs are typically aimed at those students who missed the admission requirements by a small margin. After participating in the bridging program, students are tested to determine if they have gained the necessary skills to qualify for the degree they applied for.</p>
<p>Most of the research on the efficacy of bridging programs shows positive results in terms of student retention and performance. Some relevant literature, mostly focusing on bridging programs for STEM degrees, is discussed next.</p>
<p><xref ref-type="bibr" rid="ref27">Murphy et al. (2010)</xref> determine the effect of a bridging program on the graduation rates of minority groups in scientific and technical disciplines at an American university. The authors report that participation in a bridging program contributes to student retention and significantly increases the likelihood of graduation of the participants. <xref ref-type="bibr" rid="ref37">Ssempebwa et al. (2012)</xref> investigate the effectiveness of university bridging programs at a Ugandan university, where the bridging program is aimed at attracting international students who would otherwise not qualify for admission. The authors find that the bridging program is effective and that there is not a significant difference in the performance of students who were admitted through the bridging program compared with that of students who were admitted through conventional routes.</p>
<p><xref ref-type="bibr" rid="ref35">Raines (2012)</xref> investigates the efficacy of a bridging program, which is aimed at addressing mathematics deficiencies in STEM majors, at an American university. The author states that the bridging program positively impacted the performance and retention rates of the participants. <xref ref-type="bibr" rid="ref6">Bradford et al. (2021)</xref> consider the effectiveness of university bridging programs, with a focus on STEM students. The authors analyze STEM bridging programs from 16 universities across the United States and find that participation in the programs had a significant effect on first-year performance and retention. <xref ref-type="bibr" rid="ref7">Brady and Gallant (2021)</xref> report on a qualitative assessment of a bridging program for minority groups enrolling in STEM programs at an American university. The authors state that the participants felt that the program not only increased their knowledge of relevant mathematics and science but also facilitated their transition to university.</p>
<p>Besides the challenge of students not meeting program admission requirements, universities also face the challenge of high dropout rates in data science programs. This creates a need for interventions and the early identification of students who may potentially drop out.</p>
</sec>
<sec id="sec4">
<label>1.3</label>
<title>Literature on at-risk student identification</title>
<p>The rise in dropout rates among university students remains a major concern for higher education administrators, with some institutions in South Africa experiencing dropout rates as high as 80% (<xref ref-type="bibr" rid="ref26">Moodley and Singh, 2015</xref>). This issue is particularly evident in more challenging programs, such as statistics and data science. <xref ref-type="bibr" rid="ref3">Babalola et al. (2022)</xref> discuss the challenges that lead to high dropout rates among undergraduate students enrolled in statistics programs and state that dropout rates in developing countries are always high among undergraduate students studying statistics. Major stress factors that contribute to students not advancing to the next year include financial difficulties, accommodation issues, academic pressures, and incorrect field of study due to limited information regarding their career choice (<xref ref-type="bibr" rid="ref34">Pillay and Ngcobo, 2010</xref>). It is crucial for institutions to identify at-risk students early to retain them through intervention strategies, especially for emerging and growing fields such as data science (<xref ref-type="bibr" rid="ref3">Babalola et al., 2022</xref>).</p>
<p>The identification of at-risk students has been investigated in many schools and institutions. One approach to early identification is to implement performance thresholds from the start of the students&#x2019; academic program. A study on such thresholds is <xref ref-type="bibr" rid="ref17">Gordanier et al. (2019)</xref>, where the effectiveness of early academic intervention in economics courses at a large public university is investigated. Students who fell below a 70% threshold on a performance measure or had an attendance rate below 75% were referred to the university&#x2019;s student success center for additional academic support. The authors state that the interventions improved student scores on common questions on the final exam by 6.5 to 7.5 percentage points for students at or near the performance threshold. The gains were particularly large for students who entered college with below-average mathematics placement scores. In another study by <xref ref-type="bibr" rid="ref4">Beitelmal et al. (2022)</xref>, the identification of threshold concepts in higher education, particularly in introductory statistics courses, is discussed. The authors argue that identifying and focusing on threshold concepts (key ideas that are crucial to understanding a subject) can help instructors address areas where students often struggle, leading to improved comprehension and performance.</p>
<p>Most of the recent research papers on at-risk student identification focus on predictive modeling, using machine learning and deep learning models. Given the extensive literature available on this topic, we discuss only a selection of recent publications in this area of research. For the interested reader, these selected papers also refer to many other studies on educational data mining, student performance prediction, and identification of at-risk students.</p>
<p><xref ref-type="bibr" rid="ref12">Cummings and Smolkowski (2015)</xref> discuss the use of receiver operating characteristic (ROC) curves and the area under the curve (AUC) to determine appropriate thresholds/cut-offs for identifying at-risk students via predictive models or screeners. <xref ref-type="bibr" rid="ref31">Ortiz-Lozano et al. (2020)</xref> use classification trees (CT) built on academic and socio-demographic data to identify at-risk university students. Their findings support the need for early identification and interventions, and indicate that academic data is the main contributor to making accurate predictions. A Bayesian profile regression approach based on data from undergraduate students at an Italian university, including students&#x2019; performance, motivation, and resilience, is investigated in <xref ref-type="bibr" rid="ref36">Sarra et al. (2019)</xref>. The authors were able to group students into nine profiles, each characterized by different dropout rates and combinations of covariates. <xref ref-type="bibr" rid="ref1">Al-Shabandar et al. (2019)</xref> consider several machine learning models, including random forest (RF), logistic regression (LR), gradient boosting machine (GBM), and neural network (NN), to identify students who are at risk of dropping out of large open online courses. Their study indicated that all the classifiers performed well in terms of accuracy, with GBM achieving the highest accuracy.</p>
<p><xref ref-type="bibr" rid="ref40">Veerasamy et al. (2020)</xref> consider CT and RF, based on data from early course work, for predicting student performance in an introductory programming course. <xref ref-type="bibr" rid="ref18">Jamjoom et al. (2021)</xref> accurately predict whether students would pass a course based on preliminary performance in the course. The authors use CT, k-nearest neighbors (kNN), na&#x00EF;ve Bayes (NB) classifier, and support vector machines (SVM). All models performed very well in terms of accuracy, with CT and SVM achieving the highest accuracy. Various machine learning models, such as NB, RF, CT, kNN, SVM, AdaBoost, and LR, are investigated by <xref ref-type="bibr" rid="ref32">Pek et al. (2022)</xref> for identifying at-risk students. Again, all models achieve high accuracy, with an ensemble model using SVM as the meta learner identified as the best model after optimizing hyperparameters. <xref ref-type="bibr" rid="ref19">Jang et al. (2022)</xref> use several machine learning models to identify at-risk students in seven courses at a Korean university. Mostly online behavioral features are used as variables and LR was found to be the best model based on performance metrics such as AUC and accuracy.</p>
<p><xref ref-type="bibr" rid="ref9">Carneiro et al. (2022)</xref> consider kNN, CT, RF, NB, NN, and pruning-based rule induction for at-risk student identification. Their feature set includes socio-demographic and geographical variables in addition to academic performance variables. All the machine learning models had high accuracy, with the pruning-based rule induction being the best performing model. <xref ref-type="bibr" rid="ref21">K&#x00F6;hler et al. (2022)</xref> use a wide range of machine learning models to predict which students are at risk of failing an introductory course at a Chilean university. The study involves engineering students who can choose between a 4-year program and a 6-year program. The authors identify SVM as the best performing model in terms of accuracy. <xref ref-type="bibr" rid="ref5">Borna et al. (2024)</xref> analyze data from the Open University Learning Analytics Dataset to identify students who are at risk of withdrawing. The authors explored several classification models and found that RF had the highest accuracy. <xref ref-type="bibr" rid="ref2">Atindama et al. (2025)</xref> discuss the impact of targeted interventions on the retention of at-risk engineering students at a private research university. The study focuses on historically underrepresented students and three different intervention strategies. The authors use LR to predict on-time graduation, before and after interventions, and find that the tailored intervention strategies are effective. <xref ref-type="bibr" rid="ref20">Kalita et al. (2025)</xref> consider a bidirectional long short-term memory (bi-LSTM) network to predict student performance and identify at-risk students at an American university. The authors find that the bi-LSTM model outperforms several other machine learning models, achieving an accuracy of 88%.</p>
<p>Given the discussed literature, most machine learning models can be used to accurately identify at-risk students. However, the best performing model varies across studies. This observation is supported by <xref ref-type="bibr" rid="ref19">Jang et al. (2022)</xref>, where several studies are listed that identify different best performing machine learning models for at-risk student identification.</p>
<p>Once the at-risk students are identified, action should be taken to intervene. The intervention strategies employed include transitional and orientation classes, motivation, and building positive relationships to improve the literacy and learning skills of students (<xref ref-type="bibr" rid="ref24">Lowder et al., 2022</xref>). <xref ref-type="bibr" rid="ref36">Sarra et al. (2019)</xref> suggest that all intervention programs should improve students&#x2019; resilience by enhancing their ability to plan and set goals to manage their studies. <xref ref-type="bibr" rid="ref33">P&#x00E9;rez (1998)</xref> describes a strategy that will first divide the at-risk students into meaningful subsets and then offer support to assist with the everyday problems, connection opportunities to allow networking between students, and transformation strategies to overcome barriers preventing students from reaching their full potential.</p>
</sec>
<sec id="sec5">
<label>1.4</label>
<title>Motivation and layout of the paper</title>
<p>The North-West University (NWU) is one of the largest universities in South Africa, with over 50,000 students across its three campuses in Potchefstroom, Mahikeng, and Vanderbijlpark. The NWU is a balanced teaching-learning and research university that offers a broad spectrum of programs across eight faculties, with unique strengths and demographics on each campus. The NWU typically ranks among the top 1,000 universities in the world, according to several ranking systems (see, for example, <xref ref-type="bibr" rid="ref30">North-West University, 2025b</xref>). The Vanderbijlpark campus (VC), located next to the Vaal River, is the smallest and fastest growing campus of the three. The vast majority of the students on the VC are African, and many of them come from poor households and from schools with limited resources. Most of these students also make use of government subsidized student loans, which include a small monthly stipend that is often their only means for covering basic living expenses. Due to under-resourced schooling, students often do not meet the requirements for the standard three-year degree programs. Therefore, extended programs for several degrees are mainly offered on the VC.</p>
<p>Lecturers in the data science programs at the VC of the NWU have recently become concerned with the dropout rates for these programs. The differences in graduation rates of the standard and extended data science programs are of specific importance, since the extended programs were introduced to provide an opportunity for students from under-resourced schools who do not meet the requirements for the standard programs to study a data science program. The secondary aim of introducing the extended programs was to increase the number of data science graduates, due to the need for more qualified data scientists in South Africa.</p>
<p>These are very demanding degrees, resulting in many students taking exceptionally long to graduate or dropping out of the programs after several years. The lecturers have recognized the importance of providing students with guidance regarding their future studies, as many dropouts leave the university without a formal qualification. Some factors contributing to student dropouts for the VC data science programs have been identified, which include the following:</p>
<list list-type="bullet">
<list-item>
<p>Financial and personal challenges</p>
</list-item>
<list-item>
<p>Adaptability to a new environment</p>
</list-item>
<list-item>
<p>Difficulty and intensity of the programs</p>
</list-item>
<list-item>
<p>Lack/gaps in mathematics foundation</p>
</list-item>
<list-item>
<p>Move to another program</p>
</list-item>
<list-item>
<p>COVID-19 pandemic</p>
</list-item>
</list>
<p>In this paper, a simple threshold approach based on academic performance data is formulated to identify students who are at risk of dropping out of the data science programs at the VC. The aim of our research study is to determine the following:</p>
<list list-type="bullet">
<list-item>
<p>Can this simple threshold approach be used to effectively identify at-risk students in these programs?</p>
</list-item>
<list-item>
<p>Is there a significant difference between the graduation rates for the standard and extended programs?</p>
</list-item>
<list-item>
<p>Can an intervention process for at-risk students in these programs, aimed at guiding students through their academic life cycles, be proposed?</p>
</list-item>
</list>
<p>Identifying at-risk students early allows for discussions and interventions with students whose performances indicate that they are unlikely to graduate within the maximum time allowed by the university. We establish threshold marks for core modules in the programs through an evaluation of student performance. The main reason for opting for this benchmarking approach is for easier communication and application, compared to the use of machine learning models. Easily understandable thresholds are straightforward to communicate to students early in their academic life cycle and can serve as motivation for them to meet these thresholds rather than just trying to pass modules. The goal of the study is not to build the most accurate predictive model, for which machine learning models would be more appropriate. The use of understandable thresholds also makes it easier to develop a structured and practically applicable intervention process. While the thresholds are program-specific, a similar process can be followed to develop a tailored framework for other degree programs. We perform a statistical analysis on the use of the thresholds, including LR, formal tests, and sensitivity/specificity analysis, which further supports our approach.</p>
<p>There is limited literature that focuses on extended degree programs, since these programs are much less common than bridging and other transitional programs. This paper may address this gap in the literature by highlighting the differences between extended programs and bridging programs. The aim of these discussions is to stimulate conversations among educators regarding the role and viability of extended programs at their own institutions, particularly for degrees relating to STEM fields and professional areas experiencing skill shortages.</p>
<p>The contribution of this work and its future application is of great importance for various reasons. First, we believe that program-specific guidelines that can assist in the early identification of students at risk of dropping out, together with an intervention process, could possibly increase graduation rates. Furthermore, students referred to other programs through the intervention process could at least leave the university with alternative degrees, rather than simply dropping out of university after several years with no formal qualifications. Second, improving graduation rates specifically for the extended programs could motivate the introduction of extended data science programs at other South African universities. The structural differences between the standard and extended programs at the VC could then provide a foundation for developing such programs at other universities. Third, several statistics and data science related professions are classified by the South African government as critical skills. This classification means that there are major shortages in these professions, ranging from corporate jobs to university lecturers and teachers. Furthermore, many qualified South African data scientists readily find work overseas and emigrate, mainly due to socio-political concerns in South Africa, which further exacerbates local skill shortages. Should the intervention process prove successful, more data science graduates could enter the job market to alleviate these skill shortages.</p>
<p>The remainder of the paper is structured as follows. In Section 2, the data science programs offered at the VC are briefly discussed. The key differences between the extended and standard programs are also highlighted. In Section 3, the performances of students participating in both extended and standard programs are evaluated. Mark thresholds for core modules are also identified in this section, with the aim of identifying at-risk students. The efficacy of the thresholds is evaluated against a more recent cohort of students. Lastly, statistical methods for evaluating the performance of the simple threshold approach are discussed. In Section 4, the results and interpretations of the statistical analyses are presented, and an intervention process, which has recently been employed for these programs, is suggested. The paper is concluded in Section 5 with some closing remarks.</p>
</sec>
</sec>
<sec id="sec6">
<label>2</label>
<title>The extended data science programs</title>
<p>The VC presents three data science related Bachelor of Science degree programs with different specializations. The standard programs take a minimum of 3 years to complete, where students have very busy schedules throughout the durations of the programs. The core modules of these degrees are centered around mathematics, statistics, and programming, with a focus on applications to business and finance. Modules on economics, accounting and business ethics are also included in all three programs. The specializations include degrees in financial mathematics (FM), quantitative risk management (QRM), and business analytics (BA).</p>
<p>The FM program includes additional modules on more advanced mathematics, covering topics such as multivariate calculus and real analysis, as well as some modules on risk management. The QRM program focuses more on risk management courses, including topics such as investment management, bank risk management, financial markets, and financial risk management. The BA program incorporates several additional modules on programming, covering topics such as object-oriented programming, data structures and algorithms, databases, and decision support systems.</p>
<p>Since these data science programs are mathematically demanding, the most important admission requirement is that applicants should have a mark of at least 70% for mathematics in Grade 12 (their final year of high school). However, in 2014, extended programs for these degrees were introduced to attract more students to the programs on the VC of the NWU. In addition to a lower admission point score, applicants need a mark of at least 50% for mathematics in Grade 12 to qualify for the extended programs. The reader is referred to <xref ref-type="bibr" rid="ref29">North-West University (2025a)</xref> for the complete admission requirements, detailed descriptions of each data science program, and the content of the modules included in each program. A discussion on the academic preparedness of students and the performance of the first two cohorts of students in all extended programs presented at the VC is provided in <xref ref-type="bibr" rid="ref15">Du Plessis and Gerber (2012)</xref>.</p>
<p>The extended data science programs at the VC were developed using certain principles of traditional bridging programs. The extension of the three-year data science programs to four-year programs could be seen as degrees for which there are prolonged bridging programs. However, there are some key differences and features.</p>
<p>Firstly, admission to the extended programs is not contingent on the student&#x2019;s performance in a test after completion of a short bridging program. The purpose is to allow students who do not qualify for the standard programs to enroll for these degrees via the extended programs, where they will build up the necessary skills to participate in the remainder of the programs.</p>
<p>Secondly, the degrees are extended by 1 year, where the first-year mathematics and statistics modules of the standard programs are split over the first 2 years in the extended programs. Additional basic mathematics and statistics modules are used in the extended programs to bring students who did not qualify for the standard programs up to speed. Thus, the programs are designed such that the core modules are aligned when extended program students enter their third year and standard program students enter their second year.</p>
<p>Lastly, the extended programs are in no way seen as inferior to the standard programs. Since all work covered in the standard programs is also covered in the extended programs, students graduating from the different programs should be equipped with the same skillset. Furthermore, students from the different programs are treated equally in terms of applications for postgraduate studies in these data science programs.</p>
<p>There have been many success stories from these extended programs. However, an increased number of student dropouts has been observed in recent years, which warrants a thorough investigation into the success of the programs. The recent implementation of proper procedures to terminate the studies of repeatedly underperforming students in both the extended and standard programs further motivates the need for such an investigation.</p>
</sec>
<sec id="sec7">
<label>3</label>
<title>Methodology and establishment of performance thresholds</title>
<sec id="sec8">
<label>3.1</label>
<title>Investigation into student performance</title>
<p>In this section, we present an analysis of the performance of students enrolled in the standard and extended data science programs at the VC. The first part of the analysis focuses on key indicators such as graduation and dropout rates for both the standard and extended programs. This is followed by an evaluation of the number of years it took students to obtain their degrees. Using box-and-whisker plots, we examine the performance in core mathematics and statistics modules to serve as the foundation for establishing performance thresholds, which are motivated and discussed in Section 3.2.</p>
<p>The dataset for this analysis consists of the registration information and academic performance of all students who registered for the data science programs between 2014 and 2018. The performance of these students was considered up to the end of 2022 to allow at least the minimum qualification time for the 2018 registrants. The data was obtained from the Integrated Planning &#x0026; Strategic Intelligence department at the NWU. It should be noted that the data represents complete student records, where the module marks for all completed modules are recorded. The reflected marks represent finalized module marks and have thus already undergone validation and approval from the respective educators for each module. After the extraction of the data, simple data cleaning steps are performed. These data cleaning steps include merging or elimination of duplicate records and, where necessary, correcting administrative inaccuracies from alternative student records. The extracted cohort consists of 125 students in total, where the performance of the entire cohort is considered in this section. Of the 125 students, 73 were registered for the standard programs and 52 were registered for the extended programs.</p>
<p>Students were categorized into three groups, based on their progression in the programs. The first group consisted of the students who graduated from the programs. The second group consisted of students who dropped out of the programs, whether they discontinued their studies or switched to another program presented at the VC. Dropouts are thus defined as students who did not complete any of the data science programs and are also not still busy with any of these programs. The last group consisted of ongoing students, which are those students who were still enrolled in the programs at the end of 2022.</p>
<p><xref ref-type="table" rid="tab1">Tables 1</xref>, <xref ref-type="table" rid="tab2">2</xref> display the categorization of the students for the standard and extended programs, respectively. The FM program is by far the most popular for both the standard and extended programs, followed by the QRM program. Overall, more students register for the standard programs than for the extended programs. Clear differences in the graduation and dropout rates can be observed between the programs. The dropout rate for the extended FM program is almost double that of the standard FM program. The opposite holds for the QRM programs, where the dropout rate for the extended QRM program is much lower than that of the standard QRM program. The overall dropout rates for the standard and extended programs are very similar, but this could be attributed in part to the higher proportion of ongoing students in the extended programs. The higher proportion of ongoing students in the extended programs also explains the much lower overall graduation rate for these programs.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Categorization of students for the standard programs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Standard programs</th>
<th align="center" valign="top">Graduates</th>
<th align="center" valign="top">Dropouts</th>
<th align="center" valign="top">Ongoing</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Financial mathematics</td>
<td align="center" valign="middle">24</td>
<td align="center" valign="middle">11</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="middle">39 (53.4%)</td>
</tr>
<tr>
<td align="left" valign="middle">Quantitative risk management</td>
<td align="center" valign="middle">12</td>
<td align="center" valign="middle">13</td>
<td align="center" valign="middle">0</td>
<td align="center" valign="middle">25 (34.2%)</td>
</tr>
<tr>
<td align="left" valign="middle">Business analytics</td>
<td align="center" valign="middle">5</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="middle">0</td>
<td align="center" valign="middle">9 (12.3%)</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">41 (56.2%)</td>
<td align="center" valign="middle">28 (38.4%)</td>
<td align="center" valign="middle">4 (5.5%)</td>
<td align="center" valign="middle">73 (100%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Categorization of students for the extended programs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Extended programs</th>
<th align="center" valign="top">Graduates</th>
<th align="center" valign="top">Dropouts</th>
<th align="center" valign="top">Ongoing</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Financial mathematics</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">16</td>
<td align="center" valign="middle">9</td>
<td align="center" valign="middle">32 (61.5%)</td>
</tr>
<tr>
<td align="left" valign="middle">Quantitative risk management</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="middle">1</td>
<td align="center" valign="middle">12 (23.1%)</td>
</tr>
<tr>
<td align="left" valign="middle">Business analytics</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">8 (15.4%)</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">17 (32.7%)</td>
<td align="center" valign="middle">22 (42.3%)</td>
<td align="center" valign="middle">13 (25.0%)</td>
<td align="center" valign="middle">52 (100%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The general findings suggest the need for support and intervention, particularly in addressing the high dropout rates, to improve program efficiency. We proceed with the analysis by examining how long it takes for students to graduate from the programs. <xref ref-type="table" rid="tab3">Table 3</xref> displays the time taken for students to graduate, measured in terms of the minimum number of years required to graduate from the programs (i.e., 3&#x202F;years for the standard programs and 4&#x202F;years for the extended programs). Note that most students in the extended programs do not complete their degrees in the minimum required time. Excluding the ongoing students in the dataset, only 25 of the 108 (23.1%) students from both the standard and extended programs graduate in the minimum required time. This extension of study years often leads to even more financial difficulty, since most of the students are subsidized through government bursaries with performance requirements. The need for a more in-depth look at student performance is motivated, where possible reasons why students are struggling in their study progression should be investigated in future research.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Time taken to graduate.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Program</th>
<th align="center" valign="top">Minimum required time</th>
<th align="center" valign="top">1 additional year</th>
<th align="center" valign="top">2&#x202F;+&#x202F;additional years</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Standard programs</td>
<td align="center" valign="middle">21</td>
<td align="center" valign="middle">13</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">41 (70.7%)</td>
</tr>
<tr>
<td align="left" valign="middle">Extended programs</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="middle">12</td>
<td align="center" valign="middle">1</td>
<td align="center" valign="middle">17 (29.3%)</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">25 (43.1%)</td>
<td align="center" valign="middle">25 (43.1%)</td>
<td align="center" valign="middle">8 (13.8%)</td>
<td align="center" valign="middle">58 (100%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We continue our analysis by assessing the performance of students in the core mathematics and statistics modules within these programs. These modules were identified as core modules for two reasons. The first is that these modules consist of the most important technical work that data science students need to master. The second is that these modules serve as foundations for further modules that students often struggle with (motivated by discussions with various stakeholders in the programs). The module code and module name of the identified core modules are provided in <xref ref-type="table" rid="tab4">Table 4</xref>. The reader is referred to the NWU yearbooks (<xref ref-type="bibr" rid="ref29">North-West University, 2025a</xref>) for more details on these modules, where the general module outcomes for each module are described. The yearbooks also contain detailed descriptions of the full curriculums, student progression, and prerequisites for the standard and extended programs.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Description of core modules.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Module code</th>
<th align="left" valign="top">Module description</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">WISS121</td>
<td align="left" valign="middle">Introduction to Mathematics II (Extended)</td>
</tr>
<tr>
<td align="left" valign="middle">STTF125</td>
<td align="left" valign="middle">Introductory Statistical Inference (Extended)</td>
</tr>
<tr>
<td align="left" valign="middle">MTHS111</td>
<td align="left" valign="middle">Introductory Algebra and Calculus I</td>
</tr>
<tr>
<td align="left" valign="middle">MTHS121</td>
<td align="left" valign="middle">Introductory Algebra and Calculus II</td>
</tr>
<tr>
<td align="left" valign="middle">STTN215</td>
<td align="left" valign="middle">Probability and Sampling Theory</td>
</tr>
<tr>
<td align="left" valign="middle">STTN225</td>
<td align="left" valign="middle">Statistical Inference and Data Analysis</td>
</tr>
<tr>
<td align="left" valign="middle">MTHS211</td>
<td align="left" valign="middle">Multivariable Calculus I</td>
</tr>
<tr>
<td align="left" valign="middle">MTHS222</td>
<td align="left" valign="middle">Linear Algebra II</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec9">
<label>3.2</label>
<title>Establishing mark thresholds for at-risk identification</title>
<p>The marks obtained for the core mathematics and core statistics modules are presented as box-and-whisker plots in <xref ref-type="fig" rid="fig1">Figures 1</xref>, <xref ref-type="fig" rid="fig2">2</xref>, respectively. Note that the marks for each core module are split between graduates and dropouts, where the marks for ongoing students are not considered. These plots provide insight into the central tendency, spread, and presence of outliers in the dataset. The upper and lower whiskers of the box-and-whisker plots show the range of marks, with outliers indicated by dots, while the box itself illustrates the interquartile range, with the median indicated by the middle bar. As expected, a clear difference in the distribution of marks for graduates and dropouts can be observed. We used key points in the distributions, such as the quartiles, to inform the establishment of thresholds. The goals of these thresholds are to monitor student progression through the program and to assist lecturers in advising students based on their academic performance in these critical modules.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Boxplots of the distribution of marks for the core mathematics modules.</p>
</caption>
<graphic xlink:href="feduc-10-1501796-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Box plot showing marks for core mathematics modules, comparing graduates and dropouts across various courses. Each box represents the distribution of marks in percentages, with central lines indicating medians, boxes depicting interquartile ranges, and whiskers extending to minimum and maximum values. Outliers are marked outside the whiskers.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Boxplots of the distribution of marks for the core statistics modules.</p>
</caption>
<graphic xlink:href="feduc-10-1501796-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Boxplot comparing the marks of graduates and dropouts in core statistics modules STTF125, STTN215, and STTN225. Marks range from 0 to 100 percent, with medians visibly differing across groups.</alt-text>
</graphic>
</fig>
<p>To inform a starting point for setting a threshold for each module, we considered the first quartile (Q1) of the graduates and the third quartile (Q3) of the dropouts. The reasoning behind this approach was to capture the majority of graduates with a type of lower bound, defined by Q1 of the graduates&#x2019; marks, while trying to exclude the majority of dropouts with a type of upper bound, defined by Q3 of the dropouts&#x2019; marks. Consequently, the logic was that the bottom 25% of students in the graduate group may have been at risk of dropping out, and the top 25% of students in the dropout group may have had a chance of improving their performance to graduate from the programs. As an example, consider the module MTHS111. From <xref ref-type="fig" rid="fig1">Figure 1</xref>, the value of Q1 for graduates is 65% and the value of Q3 for dropouts is 62.5%. Using these values as a basis for discussions with stakeholders, the threshold for MTHS111 was set at 65%. Additionally, groupings of the core modules were created to define more holistic thresholds, with small adjustments in some cases. For all the holistic thresholds, the average mark of the grouped modules is considered, and a student is allowed a maximum downward deviation of 5% for one module in each grouping.</p>
<p>Using this approach, together with small adjustments made in consultation with various lecturers involved in these modules, the thresholds given in <xref ref-type="table" rid="tab5">Table 5</xref> were recommended. The small adjustments involve discretionary rounding to align the thresholds with either the lower or upper 5% band based on the expert opinion of the lecturers involved. This was done in cases where there is a slight difference between Q1 for graduates and Q3 for dropouts in order to ease communication and applicability of the thresholds. For example, first-year standard program and second-year extended program students should achieve at least 65% for MTHS111 and 60% for MTHS121. Alternatively, using the holistic thresholds, an average of 60% for the two modules should be achieved, where a maximum downward deviation of 5% is allowed for one of the two modules. That is, if a student achieves a mark of 55% for MTHS111, the student will have to achieve a mark of 65% in MTHS121 to have an average for this group of 60%.</p>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>Recommended thresholds for core modules.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Year level</th>
<th align="left" valign="top">Module</th>
<th align="center" valign="top">Average</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Extended first year</td>
<td align="left" valign="middle">WISS121 (70%) and STTF125 (70%)</td>
<td align="center" valign="middle">70%</td>
</tr>
<tr>
<td align="left" valign="middle">Standard first year<break/>Extended second year</td>
<td align="left" valign="middle">MTHS111 (65%) and MTHS121 (60%)</td>
<td align="center" valign="middle">60%</td>
</tr>
<tr>
<td align="left" valign="middle">Standard second year<break/>Extended third year</td>
<td align="left" valign="middle">STTN215 (60%) and STTN225 (55%)</td>
<td align="center" valign="middle">55%</td>
</tr>
<tr>
<td align="left" valign="middle">Standard second year<break/>Extended third year</td>
<td align="left" valign="middle">MTHS211 (55%) and MTHS222 (55%)</td>
<td align="center" valign="middle">55%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To test the validity of the established thresholds, we applied them to the students in our dataset, excluding those students who dropped out of the programs before the end of their first-year studies. This helped us evaluate how effective the thresholds are at identifying at-risk students. In <xref ref-type="fig" rid="fig3">Figure 3</xref>, we observe distinct differences in the adherence to the thresholds between dropouts and graduates, in both the standard and extended programs. We observe in the standard program that a higher percentage of graduates met all the criteria compared to dropouts, indicating the efficacy of the thresholds in predicting possible graduates. In the extended program, there is a similar trend, while the percentage of graduates meeting all criteria is somewhat lower than that of the standard program graduates. Many students who miss one of the thresholds still graduate, but it is clear that students missing two or more thresholds are much more likely to drop out of the programs. These insights highlight the practical value of the established thresholds in distinguishing between graduates and dropouts in both the standard and extended programs.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>The proportion of students who missed a certain number of thresholds for the standard and extended programs. (<bold>a</bold>) standard program graduates, (<bold>b</bold>) extended program graduates, (<bold>c</bold>) standard program dropouts, (<bold>d</bold>) extended program dropouts.</p>
</caption>
<graphic xlink:href="feduc-10-1501796-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four pie charts showing the percentage of students among the standard and extended programs who missed a certain number of thresholds. Chart (a) for standard program graduates: Miss none 73%, Miss one 22%, Miss two 3%, Miss all 2%. Chart (b) for extended program graduates: Miss none 47%, Miss one 29%, Miss two 18%, Miss three 6%, Miss all 0%. Chart (c) for standard program dropouts: Miss all 53%, Miss none 21%, Miss one 16%, Miss two 10%. Chart (d) for extended program dropouts: Miss all 63%, Miss none 11%, Miss one 21%, Miss two 5%, Miss three 0%.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec10">
<label>3.3</label>
<title>Further evaluation of the thresholds on a more recent cohort</title>
<p>In this section, the efficacy and validity of the thresholds are further explored by applying them to the next cohort of students. The aim of this extension is to determine whether there has recently been an increase in the number of dropouts, which would further motivate the need for interventions. The validation dataset consists of the 17 ongoing students from the previous dataset, as well as students who registered for the programs between 2019 and 2023. The performance of the students up to the end of 2023 was considered. The validation dataset consisted of 116 students, of which 47 were enrolled in the standard programs and 69 were enrolled in the extended programs. The extended programs are now attracting more students than the standard programs. This highlights the importance of the extended programs as a tool for increased student intake, since these students would not have met the requirements for the standard programs.</p>
<p>The same data cleaning adjustments as described in Section 3.1 were made and the students were again categorized as graduates, dropouts, or ongoing students. The categorizations of the validation dataset for the standard and extended programs are given in <xref ref-type="table" rid="tab6">Tables 6</xref>, <xref ref-type="table" rid="tab7">7</xref>, respectively. The student numbers in this dataset are more evenly distributed between the three programs, compared to the dataset used in Section 3.1. Note that the numbers of ongoing students in the validation dataset are much higher, due to the performance being measured up to the end of 2023 while all students registered up to 2023 are considered. Thus, many students have only been enrolled for one, two, or three (for extended program students) years and cannot complete their degrees in this timeframe.</p>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>Categorization of the more recent cohort of students for the standard programs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Standard programs</th>
<th align="center" valign="top">Graduates</th>
<th align="center" valign="top">Dropouts</th>
<th align="center" valign="top">Ongoing</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Financial mathematics</td>
<td align="center" valign="middle">5</td>
<td align="center" valign="middle">6</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">18 (38.3%)</td>
</tr>
<tr>
<td align="left" valign="middle">Quantitative risk management</td>
<td align="center" valign="middle">1</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">13</td>
<td align="center" valign="middle">16 (34.0%)</td>
</tr>
<tr>
<td align="left" valign="middle">Business analytics</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">13 (27.7%)</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">9 (19.1%)</td>
<td align="center" valign="middle">11 (23.4%)</td>
<td align="center" valign="middle">27 (57.4%)</td>
<td align="center" valign="middle">47 (100%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab7">
<label>Table 7</label>
<caption>
<p>Categorization of the more recent cohort of students for the extended programs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Extended programs</th>
<th align="center" valign="top">Graduates</th>
<th align="center" valign="top">Dropouts</th>
<th align="center" valign="top">Ongoing</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Financial mathematics</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">15</td>
<td align="center" valign="middle">26 (37.7%)</td>
</tr>
<tr>
<td align="left" valign="middle">Quantitative risk management</td>
<td align="center" valign="middle">0</td>
<td align="center" valign="middle">5</td>
<td align="center" valign="middle">15</td>
<td align="center" valign="middle">20 (29.0%)</td>
</tr>
<tr>
<td align="left" valign="middle">Business analytics</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">7</td>
<td align="center" valign="middle">14</td>
<td align="center" valign="middle">23 (33.3%)</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">6 (8.7%)</td>
<td align="center" valign="middle">19 (27.5%)</td>
<td align="center" valign="middle">44 (63.8%)</td>
<td align="center" valign="middle">69 (100%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Although the ratio of graduates to dropouts will significantly improve as ongoing students complete their degrees, it is still of great concern. It is important to note that proper procedures were implemented from 2023 to terminate the studies of students who had been busy with their degrees for too long and did not show sufficient progress throughout 2023. Discussions were also held with students who performed very poorly, where some of them were advised to convert to a more suitable program at the NWU, so that they could still obtain a degree before leaving the university. These factors also contributed to the higher dropout rates.</p>
<p>For the validation dataset, the time taken for students to graduate is displayed in <xref ref-type="table" rid="tab8">Table 8</xref>. There is a clear deterioration with respect to the time it takes students to graduate in this more recent dataset. The significant increase in the percentage of students who take 2 or more additional years to complete their degrees is of particular concern. Although the guidance accompanying the implementation of the derived thresholds might have a positive impact on the time taken to graduate, further investigation into the admission requirements for the programs might also be considered.</p>
<table-wrap position="float" id="tab8">
<label>Table 8</label>
<caption>
<p>Time taken to graduate for the more recent cohort of students.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Program</th>
<th align="center" valign="top">Minimum required time</th>
<th align="center" valign="top">1 additional year</th>
<th align="center" valign="top">2+ additional years</th>
<th align="center" valign="top">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Standard programs</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">9 (60.0%)</td>
</tr>
<tr>
<td align="left" valign="middle">Extended programs</td>
<td align="center" valign="middle">1</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">6 (40.0%)</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">3 (20.0%)</td>
<td align="center" valign="middle">6 (40.0%)</td>
<td align="center" valign="middle">6 (40.0%)</td>
<td align="center" valign="middle">15 (100%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Next, the performance of students against the thresholds set in Section 3.2 is investigated. Recall that there are three thresholds set for the standard program students and four for the extended program students. <xref ref-type="fig" rid="fig4">Figure 4</xref> displays the performance of graduates and dropouts for both programs against the thresholds.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>The performance of the more recent cohort of students against the thresholds in terms of the number of threshold breaches. (<bold>a</bold>) standard program graduates, (<bold>b</bold>) extended program graduates, (<bold>c</bold>) standard program dropouts, (<bold>d</bold>) extended program dropouts.</p>
</caption>
<graphic xlink:href="feduc-10-1501796-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four pie charts showing the performance in terms of the number of threshold breaches for a more recent cohort of students. (a) Standard graduates: miss none 22%, miss one 34%, miss two 22%, miss all 22%. (b) Extended graduates: miss none 33%, miss one 0%, miss two 0%, miss all 0%, miss three 67%. (c) Standard dropouts: miss none 36%, miss one 46%, miss two 0%, miss all 18%. (d) Extended dropouts: miss none 26%, miss one 53%, miss two 5%, miss all 5%, miss three 11%.</alt-text>
</graphic>
</fig>
<p>It is interesting to see such large differences in threshold performance between the original dataset and the validation dataset. There is a higher percentage of graduates who miss more than one of the thresholds, supporting the concerns about declining student performance over the past few years. For the dropouts, however, there is a clear shift towards missing fewer thresholds before dropping out of the programs. This could indicate that the interventions and procedures recently implemented are effective to some degree. Dropouts who miss no thresholds might also be attributed to financial or other factors.</p>
</sec>
<sec id="sec11">
<label>3.4</label>
<title>Assessing the validity of the threshold approach</title>
<p>To establish the validity of our simple threshold approach, several statistical methods and tests can be considered. For this analysis, missing a threshold is defined as a <italic>breach</italic>, where the number of breaches and dropout status for each student in the standard and extended programs are considered. Similar to the previous evaluations, the students of the standard and extended programs are considered separately. The methods considered can be used to determine whether there is a relationship between the number of breaches and dropout status and whether the number of threshold breaches can be used as an indicator of at-risk students.</p>
<p>First, we will consider two important formal tests. The tie-corrected Mann&#x2013;Whitney test (see, for example, <xref ref-type="bibr" rid="ref22">Lehmann and D&#x2019;Abrera, 2006</xref>) can be used to determine whether two groups differ in their distributions. In the context of our study, this test can be used to assess whether the number of breaches differs between the graduates and dropouts. The tie-corrected test is used since the number of breaches variable can only take on a few distinct values, resulting in many ties in the data. Fisher&#x2019;s exact test (see, for example, <xref ref-type="bibr" rid="ref25">Mehta and Patel, 1983</xref>) can be used to determine whether there is an association between two categorical variables. This test is more appropriate than a chi-square test, since exact <inline-formula>
<mml:math id="M1">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula>-values can be calculated for uneven group distributions and small sample sizes. For our study, this test can be used to establish if there is an association between the number of threshold breaches and dropout status.</p>
<p>Next, we will consider a logistic regression model that assesses the probability of being a dropout as a function of the number of breaches, which is given by</p>
<disp-formula>
<mml:math id="M2">
<mml:mtext>logit</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>Dropout</mml:mtext>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:mtext>Breaches</mml:mtext>
<mml:mo>.</mml:mo>
</mml:math>
</disp-formula>
<p>If the predictor coefficient <inline-formula>
<mml:math id="M3">
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula> is statistically significant, it would mean that the number of breaches is a suitable variable for identifying at-risk students. Furthermore, <inline-formula>
<mml:math id="M4">
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula> should be positive such that more breaches relate to a higher likelihood of dropping out.</p>
<p>Lastly, we will perform a sensitivity/specificity analysis, where the number of breaches is considered to classify students as dropouts or graduates. We consider all possible classification cases due to a limited number of possible classification cut-offs, where a student is classified as a dropout if they had a certain number of breaches or more. That is, sensitivity represents the proportion of dropouts correctly classified as dropouts under the specified classification cut-off, and specificity represents the proportion of correctly classified graduates for the same classification cut-off. The accuracy is also considered, which represents the overall proportion of correctly classified students under the specified classification cut-off. The values of these classification metrics can then be considered to determine an appropriate cut-off (number of breaches) to classify a student as an at-risk student. From the sensitivity/specificity analysis, we can also estimate the AUC. This metric indicates whether a model is effective at differentiating between two classes, which are the dropout and graduate classes in our case.</p>
</sec>
</sec>
<sec sec-type="results" id="sec12">
<label>4</label>
<title>Results and discussion</title>
<sec id="sec13">
<label>4.1</label>
<title>Statistical analysis of the thresholds</title>
<p>In this section, we provide the results of the statistical analysis of the thresholds established in Section 3.2 and provide insightful discussions. The analysis is based on the data of the first cohort of students, which was used to establish the thresholds, i.e., all first-time entries into the programs between 2014 and 2018, monitored up until the end of 2022 (see Section 3.1 for further information).</p>
<p>First, let us consider the two formal tests. The <inline-formula>
<mml:math id="M5">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula>-values of the Mann&#x2013;Whitney test for the standard and extended programs are, respectively, &#x003C;0.0001 and 0.0273. This indicates a statistically significant difference in the number of breaches for the graduates and dropouts, at a 5% significance level, for both programs. That is, dropouts typically have a higher number of breaches than graduates, suggesting that the number of breaches is a meaningful measure in terms of identifying at-risk students. Fisher&#x2019;s exact test is applied to examine the association between the number of breaches and dropout status. For the standard program, the <inline-formula>
<mml:math id="M6">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula>-value is 0.0002 and for the extended program, the <inline-formula>
<mml:math id="M7">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula>-value is 0.0312. Both are significant at a 5% significance level, indicating that the number of breaches and dropout status are not independent in either program. This suggests that the number of breaches has a significant association with the dropout status, further supporting the use of the number of breaches for identifying at-risk students.</p>
<p>Next, we consider a logistic regression model using only the number of breaches as a predictor. <xref ref-type="table" rid="tab9">Table 9</xref> provides details on the fitted logistic regression models for the standard and extended programs. For both models, the coefficient of the predictor (number of breaches) is positive and statistically significant at a 5% significance level. This supports our previous findings in that the number of breaches can be used as a simple and effective indicator for identifying at-risk students. Furthermore, the odds ratios indicate that each additional breach increases the odds of dropping out by factors of approximately 4 and 2, respectively, for the standard and extended programs. Both fitted models show a considerable probability of dropping out of the programs even with a single breach and notably higher probabilities for two or more breaches. This indicates that identifying a student as at-risk when two breaches have occurred may be too late, highlighting the need for early interventions. Lastly, the probability of dropping out of the extended program while having zero breaches is double that of the standard program. This may suggest that the extended program does not adequately prepare students for the alignment with the standard program (third year of the extended program and second year of the standard program) or that the admission requirements need to be reconsidered.</p>
<table-wrap position="float" id="tab9">
<label>Table 9</label>
<caption>
<p>Logistic regression results.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model details</th>
<th align="center" valign="top">Standard program</th>
<th align="center" valign="top">Extended program</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M8">
<mml:msub>
<mml:mover accent="true">
<mml:mi>&#x03B2;</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">&#x2212;1.7228</td>
<td align="center" valign="middle">&#x2212;0.8020</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M9">
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>p</mml:mi>
<mml:mtext>-value</mml:mtext>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">(&#x003C;0.0001)</td>
<td align="center" valign="middle">(0.1386)</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M10">
<mml:msub>
<mml:mover accent="true">
<mml:mi>&#x03B2;</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">1.3954</td>
<td align="center" valign="middle">0.7502</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M11">
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>p</mml:mi>
<mml:mtext>-value</mml:mtext>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">(0.0014)</td>
<td align="center" valign="middle">(0.0405)</td>
</tr>
<tr>
<td align="left" valign="middle">Odds ratio</td>
<td align="center" valign="middle">4.0367</td>
<td align="center" valign="middle">2.1174</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M12">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>Dropout</mml:mtext>
<mml:mo>&#x2223;</mml:mo>
<mml:mtext>Breaches</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.1515</td>
<td align="center" valign="middle">0.3096</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M13">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>Dropout</mml:mtext>
<mml:mo>&#x2223;</mml:mo>
<mml:mtext>Breaches</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.4189</td>
<td align="center" valign="middle">0.4871</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M14">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>Dropout</mml:mtext>
<mml:mo>&#x2223;</mml:mo>
<mml:mtext>Breaches</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.7442</td>
<td align="center" valign="middle">0.6678</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M15">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>Dropout</mml:mtext>
<mml:mo>&#x2223;</mml:mo>
<mml:mtext>Breaches</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.9215</td>
<td align="center" valign="middle">0.8098</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M16">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>Dropout</mml:mtext>
<mml:mo>&#x2223;</mml:mo>
<mml:mtext>Breaches</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>4</mml:mn>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">NA</td>
<td align="center" valign="middle">0.9001</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Lastly, we consider the sensitivity/specificity analysis for both programs under the possible classification cut-offs when using the number of breaches to classify students as graduates or dropouts. These results are presented in <xref ref-type="table" rid="tab10">Table 10</xref>. Considering that the problem at hand is identifying students at risk of dropping out of the programs, it can be argued to some extent that identifying dropouts correctly is more important than identifying graduates correctly. This is due to the consequences of identifying a potential graduate as an at-risk student being less severe than the other way around. For the standard and extended programs, the sensitivity drops from 0.7895 to 0.3158 and from 0.8947 to 0.4210, respectively, between the classification cut-offs of &#x201C;<inline-formula>
<mml:math id="M17">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>&#x201D; and &#x201C;<inline-formula>
<mml:math id="M18">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>.&#x201D; These decreases in sensitivity further support that identifying students as at-risk after two or more breaches might be too late and that early interventions are required. Based on these findings, it is recommended that students should be flagged as at-risk as soon as they have one breach, so that early interventions can be made. A satisfactory specificity is also achieved for the classification cut-off of one or more breaches, while taking into account that correctly classifying dropouts is more important. The accuracy metric also supports early at-risk identification, where the highest accuracy is achieved when using a classification cut-off of one or more breaches, for both the standard and extended programs. The AUC values for the standard and extended programs are 0.7811 and 0.7074, respectively. This indicates satisfactory discrimination between dropouts and graduates when using only the number of breaches to identify at-risk students.</p>
<table-wrap position="float" id="tab10">
<label>Table 10</label>
<caption>
<p>Sensitivity/specificity analysis results.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Classification cut-off</th>
<th align="center" valign="top">Sensitivity</th>
<th align="center" valign="top">Specificity</th>
<th align="center" valign="top">Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" colspan="4">Standard program</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M19">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">1.0000</td>
<td align="center" valign="middle">0.0000</td>
<td align="center" valign="middle">0.3167</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M20">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.7895</td>
<td align="center" valign="middle">0.7317</td>
<td align="center" valign="middle">0.7500</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M21">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.3158</td>
<td align="center" valign="middle">0.9512</td>
<td align="center" valign="middle">0.7500</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M22">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="middle">0.0526</td>
<td align="center" valign="middle">0.9756</td>
<td align="center" valign="middle">0.6833</td>
</tr>
<tr>
<td align="left" valign="middle" colspan="4">Extended program</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M23">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="top">1.0000</td>
<td align="center" valign="middle">0.0000</td>
<td align="center" valign="middle">0.5278</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M24">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="top">0.8947</td>
<td align="center" valign="middle">0.4706</td>
<td align="center" valign="middle">0.6944</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M25">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="top">0.4210</td>
<td align="center" valign="middle">0.7647</td>
<td align="center" valign="middle">0.5833</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M26">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="top">0.2105</td>
<td align="center" valign="middle">0.9412</td>
<td align="center" valign="middle">0.5556</td>
</tr>
<tr>
<td align="left" valign="middle">
<inline-formula>
<mml:math id="M27">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>4</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>Breaches</mml:mtext>
</mml:math>
</inline-formula>
</td>
<td align="center" valign="top">0.2105</td>
<td align="center" valign="middle">1.0000</td>
<td align="center" valign="middle">0.5833</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In summary, the findings of the statistical analysis support the need for the early identification of at-risk students. Most of the methods used indicate that students should be classified as at-risk after their first breach. This also means that early interventions are necessary. Considering the changes in the sensitivity between classification cut-offs, it may be important to consider light interventions after one breach and more intensive interventions after two or more breaches.</p>
<p>In the next section, an intervention process for at-risk students in the data science programs is presented. This intervention process has recently been employed at the university in an attempt to improve student retention and provide students with relevant advice throughout their academic life cycles.</p>
</sec>
<sec id="sec14">
<label>4.2</label>
<title>A recently introduced intervention process</title>
<p>To be able to address the possible issues students have in terms of factors contributing to dropping out, a method is needed to flag the students who are struggling. As shown in Section 4.1, the number of threshold breaches can be effectively used to identify at-risk students. The statistical analysis supports classifying students as at-risk if one breach occurs, leading to the need for interventions early in the students&#x2019; academic life cycles. These thresholds can be used to guide students better so that they can complete a qualification in the shortest possible time. The following intervention process is suggested and has recently been employed for students enrolled in the data science programs at the VC:</p>
<list list-type="order">
<list-item>
<p>First threshold breach: Group discussions are held with students after not meeting a threshold for the first time. During these discussions, the students are advised on the required personal commitment and effective time management expected to obtain one of the data science degrees. The students are also given an opportunity to raise any collective concerns and issues, such as problems with certain modules, lecturers, and the general academic environment. Lastly, students are made aware of student counseling services offered by the university and are encouraged to attend an organized group awareness session.</p>
</list-item>
<list-item>
<p>Additional threshold breaches: A one-on-one discussion is held with a student who misses multiple thresholds. During this discussion, causes of poor academic performance are discussed with the student and guidance/assistance is provided by the lecturers, where possible. In the case of more personal issues, the student is referred to the university&#x2019;s student counseling services.</p>
</list-item>
<list-item>
<p>Threshold breach and failing core modules: After a student fails a core module, a one-on-one discussion is held with the student. During this discussion, the student is made aware of the consequences of repeated underperformance, and a first formal warning is issued to the student. The lecturers also facilitate a conversation on alternative study options to consider, where it may be in the best interest of the student to change to a less demanding degree rather than not obtain a degree at all. Should the student again fail core modules, the exercise is repeated, where a second and final formal warning is issued to the student. At this stage, it is made clear that failing any more modules would result in the termination of the student&#x2019;s studies at the university.</p>
</list-item>
<list-item>
<p>Failing modules after the final warning: Should a student not meet the specific conditions set out in the final formal warning, or fail any more modules, the student&#x2019;s studies within the faculty are terminated in accordance with the university rules.</p>
</list-item>
</list>
<p>The above interventions would of course be tailored to the specific circumstances of the students, to achieve the best possible solution for them. The aim of the intervention process is to prevent any students from reaching the point of their studies being terminated (as set out in the university rules), as this would result in the student leaving the university with no formal qualification. The current cohort of ongoing students will be monitored to investigate the efficacy of the proposed intervention process.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec15">
<label>5</label>
<title>Conclusion</title>
<p>In this paper, an evaluation of student performance and graduation rates was conducted for data science related programs at the VC. The comparison focused on the differences in performance between standard and extended data science programs. It is evident that the dropout rates of the extended programs are higher than those of the corresponding standard programs. From an initial cohort of students, thresholds were established to identify at-risk students in the programs. These thresholds were then applied to a more recent cohort of students, and concerns regarding declining student graduation rates were confirmed.</p>
<p>We can now consider whether our aims for the research study have been met. A statistical analysis was performed, which showed that a simple threshold approach, which is tailored to these specific programs, is effective in identifying at-risk students. It was also established that there is a significant difference in the graduation rates between the standard and extended programs, which indicates that a revision of the extended program framework for these programs might be required. Furthermore, the analysis showed that early identification of at-risk students is necessary, where we recommend classifying students as at-risk as soon as they have one threshold breach. The need for early interventions is also highlighted in this analysis. An intervention process was suggested that aims to improve student retention and provide students with appropriate advice throughout their academic life cycles. We hope that the discussions in this paper will encourage other educators to consider the role and viability of extended programs, as an alternative to short transitional programs, at their own institutions.</p>
<p>Although the study presents promising results for using the thresholds to identify at-risk data science students, several limitations should be acknowledged. First, the analysis is based on data from a single South African university, which may limit the generalizability of the findings to other institutions or contexts. The thresholds are tailored to our specific program and institution, but a similar process can be followed to develop a corresponding framework for other programs and institutions. Due to the limited literature and studies on extended programs, it is also not possible to thoroughly compare our findings to those of other international studies. Most of the existing literature focuses on the evaluation of bridging programs or short transitional programs, which are inherently different from an extended degree program (as highlighted in Section 2).</p>
<p>Additionally, although the threshold approach offers a simplistic at-risk identification method, it might not capture complex relationships between performance indicators and academic risk. The study also primarily considers academic performance in core modules, potentially overlooking other factors such as socio-economic background, mental health, and support systems, which could influence student success. Another limitation is the static nature of the thresholds, which may be fixed for a long period of time before considering a re-evaluation. Changes in the program structure, especially in the core modules, could necessitate a complete redesign of the approach without sufficient data or knowledge to motivate new thresholds.</p>
<p>Future research could consider expanding the dataset to include multiple institutions to enhance the robustness and generalizability of the thresholds. This would require an evaluation of other institutions&#x2019; program structures to identify equivalent core modules to use for the benchmarking approach. Such a study would also be limited to South African universities to ensure consistency in terms of the educational framework and background of participants. Generalizability of the threshold approach may, however, be limited due to module content and difficulty level differences across institutions.</p>
<p>The primary direction of future research on at-risk identification for our programs is to consider machine learning models. The use of more complex models could provide deeper insights into the multifactorial nature of academic risk. From the literature review in Section 1, it is clear that at least LR, CT, RF, and SVM should be considered. Furthermore, many studies include socio-demographic and online behavioral features to complement academic performance features. The NWU has a learner management system (LMS) for module administration, content distribution, and reporting, which can provide valuable insights on student engagement with module content and resources. The university also has certain socio-demographic information on students, which can be requested for research purposes, subject to ethics committee approval. Lastly, the intervention process is structured to gain insights into poor academic performance, where additional socio-economic and behavioral features can be created and tracked via group and individual discussions with students. The predictive performance of the machine learning models can then be compared to that of our simple threshold approach.</p>
<p>Lastly, a longitudinal study that tracks the long-term outcomes of students who receive interventions would help to assess the true impact of the proposed intervention process. This would involve monitoring dropout rates for the standard and extended programs over several years, while consistently adhering to the proposed interventions. Exploring non-academic indicators and student feedback could assist in refining the intervention process to be more holistic and responsive to student needs. However, such a study would require an extensive timeframe to properly evaluate the impact and implement changes over different groups of students. The study by <xref ref-type="bibr" rid="ref2">Atindama et al. (2025)</xref> could serve as a valuable source for such a longitudinal study, where the efficacy of our intervention process and possible improvements to the intervention process can be considered.</p>
<p>As a closing note, consideration should be given to the fact that there is a significant proportion of graduates from the extended programs who would not otherwise have been given the opportunity to study towards obtaining a data science degree. Since there are many success stories, discontinuing the extended programs without proactive efforts to improve the graduation rates would demonstrate a lack of foresight, particularly considering the scarcity of STEM graduates in South Africa. However, it may be necessary to revise the admission requirements for the programs if the suggested intervention process is not effective.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec16">
<title>Data availability statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: confidential student data from a specific institution are used and only summarized results on the data are discussed in this paper. Requests to access these datasets should be directed to <email>Neill.Smit@nwu.ac.za</email>.</p>
</sec>
<sec sec-type="ethics-statement" id="sec17">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Faculty of Natural and Agricultural Sciences Ethics Committee of the North-West University. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="sec18">
<title>Author contributions</title>
<p>NS: Conceptualization, Formal analysis, Investigation, Methodology, Project administration, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. ZO: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. LM: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec19">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="sec20">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec21">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec22">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Al-Shabandar</surname><given-names>R.</given-names></name> <name><surname>Hussain</surname><given-names>A. J.</given-names></name> <name><surname>Liatsis</surname><given-names>P.</given-names></name> <name><surname>Keight</surname><given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>Detecting at-risk students with early interventions using machine learning techniques</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>149464</fpage>&#x2013;<lpage>149478</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2943351</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Atindama</surname><given-names>E.</given-names></name> <name><surname>Ramsdell</surname><given-names>M.</given-names></name> <name><surname>Wick</surname><given-names>D. P.</given-names></name> <name><surname>Mondal</surname><given-names>S.</given-names></name> <name><surname>Athavale</surname><given-names>P.</given-names></name></person-group> (<year>2025</year>). <article-title>Impact of targeted interventions on success of high-risk engineering students: a focus on historically underrepresented students in STEM</article-title>. <source>Front. Educ.</source> <volume>10</volume>:<fpage>1435279</fpage>. doi: <pub-id pub-id-type="doi">10.3389/feduc.2025.1435279</pub-id></citation></ref>
<ref id="ref3"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Babalola</surname><given-names>B. T.</given-names></name> <name><surname>Awe</surname><given-names>O. O.</given-names></name> <name><surname>Adarabioyo</surname><given-names>M. I.</given-names></name></person-group> (<year>2022</year>). &#x201C;Challenges of statistics education that leads to high dropout rate among undergraduate statistics students in developing countries&#x201D; in <source>Promoting statistical practice and collaboration in developing countries</source>, ed. <person-group person-group-type="editor"><name><surname>Awe</surname><given-names>O. O.</given-names></name> <name><surname>Love</surname><given-names>K.</given-names></name> <name><surname>Vance</surname><given-names>E. A.</given-names></name></person-group> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Chapman and Hall/CRC</publisher-name>), <fpage>75</fpage>&#x2013;<lpage>83</lpage>.</citation></ref>
<ref id="ref4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Beitelmal</surname><given-names>W. H.</given-names></name> <name><surname>Littlejohn</surname><given-names>R.</given-names></name> <name><surname>Okonkwo</surname><given-names>P. C.</given-names></name> <name><surname>Hassan</surname><given-names>I. U.</given-names></name> <name><surname>Barhoumi</surname><given-names>E. M.</given-names></name> <name><surname>Khozaei</surname><given-names>F.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Threshold concepts theory in higher education &#x2013; introductory statistics courses as an example</article-title>. <source>Educ. Sci.</source> <volume>12</volume>:<fpage>748</fpage>. doi: <pub-id pub-id-type="doi">10.3390/educsci12110748</pub-id></citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Borna</surname><given-names>M. R.</given-names></name> <name><surname>Saadat</surname><given-names>H.</given-names></name> <name><surname>Hojjati</surname><given-names>A. T.</given-names></name> <name><surname>Akbari</surname><given-names>E.</given-names></name></person-group> (<year>2024</year>). <article-title>Analyzing click data with AI: implications for student performance prediction and learning assessment</article-title>. <source>Front. Educ.</source> <volume>9</volume>:<fpage>1421479</fpage>. doi: <pub-id pub-id-type="doi">10.3389/feduc.2024.1421479</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bradford</surname><given-names>B. C.</given-names></name> <name><surname>Beier</surname><given-names>M. E.</given-names></name> <name><surname>Oswald</surname><given-names>F. L.</given-names></name></person-group> (<year>2021</year>). <article-title>A meta-analysis of university STEM summer bridge program effectiveness</article-title>. <source>CBE Life Sci. Educ.</source> <volume>20</volume>:<fpage>ar21</fpage>. doi: <pub-id pub-id-type="doi">10.1187/cbe.20-03-0046</pub-id>, PMID: <pub-id pub-id-type="pmid">33856899</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brady</surname><given-names>A.</given-names></name> <name><surname>Gallant</surname><given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>STEM bridge program: underrepresented minority students&#x2019; perceptions of Louis Stokes Alliance for Minority Participation program impact</article-title>. <source>J. Coll. Sci. Teach.</source> <volume>50</volume>, <fpage>57</fpage>&#x2013;<lpage>62</lpage>. doi: <pub-id pub-id-type="doi">10.1080/0047231X.2021.12290534</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cao</surname><given-names>L.</given-names></name></person-group> (<year>2017</year>). <article-title>Data science: a comprehensive overview</article-title>. <source>ACM Comput. Surv.</source> <volume>50</volume>:<fpage>43</fpage>. doi: <pub-id pub-id-type="doi">10.1145/3076253</pub-id></citation></ref>
<ref id="ref9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Carneiro</surname><given-names>M. G.</given-names></name> <name><surname>Dutra</surname><given-names>B. L.</given-names></name> <name><surname>Paiva</surname><given-names>J. G. S.</given-names></name> <name><surname>Gabriel</surname><given-names>P. H. R.</given-names></name> <name><surname>Ara&#x00FA;jo</surname><given-names>R. D.</given-names></name></person-group> (<year>2022</year>). <article-title>Educational data mining to support identification and prevention of academic retention and dropout: a case study in introductory programming</article-title>. <source>Rev. Bras. Inform. Educ.</source> <volume>30</volume>, <fpage>379</fpage>&#x2013;<lpage>395</lpage>. doi: <pub-id pub-id-type="doi">10.5753/rbie.2022.2518</pub-id></citation></ref>
<ref id="ref10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cleveland</surname><given-names>W. S.</given-names></name></person-group> (<year>2001</year>). <article-title>Data science: an action plan for expanding the technical areas of the field of statistics</article-title>. <source>Int. Stat. Rev.</source> <volume>69</volume>, <fpage>21</fpage>&#x2013;<lpage>26</lpage>. doi: <pub-id pub-id-type="doi">10.1111/j.1751-5823.2001.tb00477.x</pub-id></citation></ref>
<ref id="ref11"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll1">Council on Higher Education</collab></person-group>. (<year>2014</year>) Framework for institutional quality enhancement in the second period of quality assurance. Available online at: <ext-link xlink:href="https://www.che.ac.za/sites/default/files/publications/QEP%20Framework%20Feb%202014.pdf" ext-link-type="uri">https://www.che.ac.za/sites/default/files/publications/QEP%20Framework%20Feb%202014.pdf</ext-link> (Accessed June 18, 2025).</citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cummings</surname><given-names>K. D.</given-names></name> <name><surname>Smolkowski</surname><given-names>K.</given-names></name></person-group> (<year>2015</year>). <article-title>Selecting students at risk of academic difficulties</article-title>. <source>Assess. Eff. Interv.</source> <volume>41</volume>, <fpage>55</fpage>&#x2013;<lpage>61</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1534508415590396</pub-id>, PMID: <pub-id pub-id-type="pmid">40949303</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>De Veaux</surname><given-names>R. D.</given-names></name> <name><surname>Agarwal</surname><given-names>M.</given-names></name> <name><surname>Averett</surname><given-names>M.</given-names></name> <name><surname>Baumer</surname><given-names>B. S.</given-names></name> <name><surname>Bray</surname><given-names>A.</given-names></name> <name><surname>Bressoud</surname><given-names>T. C.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Curriculum guidelines for undergraduate programs in data science</article-title>. <source>Ann. Rev. Stat. Appl.</source> <volume>4</volume>, <fpage>15</fpage>&#x2013;<lpage>30</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev-statistics-060116-053930</pub-id></citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>De Veaux</surname><given-names>R. D.</given-names></name> <name><surname>Hoerl</surname><given-names>R.</given-names></name> <name><surname>Snee</surname><given-names>R.</given-names></name> <name><surname>Velleman</surname><given-names>P.</given-names></name></person-group> (<year>2022</year>). <article-title>Towards holistic data science education</article-title>. <source>Stat. Educ. Res. J.</source> <volume>21</volume>:<fpage>2</fpage>. doi: <pub-id pub-id-type="doi">10.52041/serj.v21i2.40</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Du Plessis</surname><given-names>L.</given-names></name> <name><surname>Gerber</surname><given-names>D.</given-names></name></person-group> (<year>2012</year>). <article-title>Academic preparedness of students &#x2013; an exploratory study</article-title>. <source>J. Transdiscipl. Res. South. Afr.</source> <volume>8</volume>, <fpage>81</fpage>&#x2013;<lpage>94</lpage>. doi: <pub-id pub-id-type="doi">10.4102/td.v8i1.7</pub-id></citation></ref>
<ref id="ref16"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll2">Glassdoor</collab></person-group> (<year>2023</year>). 50 best jobs in America for 2022. Available online at: <ext-link xlink:href="https://www.glassdoor.com/List/Best-Jobs-in-America-LST_KQ0,20.htm" ext-link-type="uri">https://www.glassdoor.com/List/Best-Jobs-in-America-LST_KQ0,20.htm</ext-link> (Accessed August 20, 2024).</citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gordanier</surname><given-names>J.</given-names></name> <name><surname>Hauk</surname><given-names>W.</given-names></name> <name><surname>Sankaran</surname><given-names>C.</given-names></name></person-group> (<year>2019</year>). <article-title>Early intervention in college classes and improved student outcomes</article-title>. <source>Econ. Educ. Rev.</source> <volume>72</volume>, <fpage>23</fpage>&#x2013;<lpage>29</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.econedurev.2019.05.003</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jamjoom</surname><given-names>M. M.</given-names></name> <name><surname>Alabdulkreem</surname><given-names>E. A.</given-names></name> <name><surname>Hadjouni</surname><given-names>M.</given-names></name> <name><surname>Karim</surname><given-names>F. K.</given-names></name> <name><surname>Qarh</surname><given-names>M. A.</given-names></name></person-group> (<year>2021</year>). <article-title>Early prediction for at-risk students in an introductory programming course based on student self-efficacy</article-title>. <source>Informatica</source> <volume>45</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.31449/inf.v45i6.3528</pub-id></citation></ref>
<ref id="ref19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jang</surname><given-names>Y.</given-names></name> <name><surname>Choi</surname><given-names>S.</given-names></name> <name><surname>Jung</surname><given-names>H.</given-names></name> <name><surname>Kim</surname><given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Practical early prediction of students&#x2019; performance using machine learning and eXplainable AI</article-title>. <source>Educ. Inf. Technol.</source> <volume>27</volume>, <fpage>12855</fpage>&#x2013;<lpage>12889</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10639-022-11120-6</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kalita</surname><given-names>E.</given-names></name> <name><surname>Alfarwan</surname><given-names>A. M.</given-names></name> <name><surname>El Aouifi</surname><given-names>H.</given-names></name> <name><surname>Kukkar</surname><given-names>A.</given-names></name> <name><surname>Hussain</surname><given-names>S.</given-names></name> <name><surname>Ali</surname><given-names>T.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Predicting student academic performance using bi-LSTM: a deep learning framework with SHAP-based interpretability and statistical validation</article-title>. <source>Front. Educ.</source> <volume>10</volume>:<fpage>1581247</fpage>. doi: <pub-id pub-id-type="doi">10.3389/feduc.2025.1581247</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>K&#x00F6;hler</surname><given-names>J.</given-names></name> <name><surname>Hidalgo</surname><given-names>L.</given-names></name> <name><surname>Jara</surname><given-names>J. L.</given-names></name></person-group> (<year>2022</year>). <article-title>Using machine learning techniques to predict academic success in an introductory programming course</article-title> in <conf-name>2022 41st international conference of the Chilean computer science society (SCCC)</conf-name> (<publisher-loc>Chile</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>.</citation></ref>
<ref id="ref22"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Lehmann</surname><given-names>E. L.</given-names></name> <name><surname>D&#x2019;Abrera</surname><given-names>H. J.</given-names></name></person-group> (<year>2006</year>). <source>Nonparametrics: Statistical methods based on ranks</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>Springer</publisher-name>.</citation></ref>
<ref id="ref23"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll3">LinkedIn</collab></person-group> (<year>2022</year>) Top 10 emerging and declining jobs in 2022. Available online at: <ext-link xlink:href="https://www.linkedin.com/pulse/top-10-emerging-declining-jobs-2022-teamleasedigital/" ext-link-type="uri">https://www.linkedin.com/pulse/top-10-emerging-declining-jobs-2022-teamleasedigital/</ext-link> (Accessed August 18, 2024).</citation></ref>
<ref id="ref24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lowder</surname><given-names>C.</given-names></name> <name><surname>O&#x2019;Brien</surname><given-names>C.</given-names></name> <name><surname>Hancock</surname><given-names>D.</given-names></name> <name><surname>Hachen</surname><given-names>J.</given-names></name> <name><surname>Wang</surname><given-names>C.</given-names></name></person-group> (<year>2022</year>). <article-title>High school success: a learning strategies intervention to reduce drop-out rates</article-title>. <source>Urban Rev.</source> <volume>54</volume>, <fpage>509</fpage>&#x2013;<lpage>530</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11256-021-00624-z</pub-id></citation></ref>
<ref id="ref25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mehta</surname><given-names>C. R.</given-names></name> <name><surname>Patel</surname><given-names>N. R.</given-names></name></person-group> (<year>1983</year>). <article-title>A network algorithm for performing Fisher&#x2019;s exact test in r &#x00D7; c contingency tables</article-title>. <source>J. Am. Stat. Assoc.</source> <volume>78</volume>, <fpage>427</fpage>&#x2013;<lpage>434</lpage>. doi: <pub-id pub-id-type="doi">10.1080/01621459.1983.10477989</pub-id></citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Moodley</surname><given-names>P.</given-names></name> <name><surname>Singh</surname><given-names>R. J.</given-names></name></person-group> (<year>2015</year>). <article-title>Addressing student dropout rates at South African universities</article-title>. <source>Alternation</source> <volume>17</volume>, <fpage>91</fpage>&#x2013;<lpage>115</lpage>. Available online at: <ext-link xlink:href="https://alternation.ukzn.ac.za/Files/docs/22%20SpEd17/06%20Moodley%20F.pdf" ext-link-type="uri">https://alternation.ukzn.ac.za/Files/docs/22%20SpEd17/06%20Moodley%20F.pdf</ext-link></citation></ref>
<ref id="ref27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Murphy</surname><given-names>T. E.</given-names></name> <name><surname>Gaughan</surname><given-names>M.</given-names></name> <name><surname>Hume</surname><given-names>R.</given-names></name> <name><surname>Moore</surname><given-names>S. G.</given-names></name></person-group> (<year>2010</year>). <article-title>College graduation rates for minority students in a selective technical university: will participation in a summer bridge program contribute to success?</article-title> <source>Educ. Eval. Policy Anal.</source> <volume>32</volume>, <fpage>70</fpage>&#x2013;<lpage>83</lpage>. doi: <pub-id pub-id-type="doi">10.3102/0162373709360064</pub-id>, PMID: <pub-id pub-id-type="pmid">23136456</pub-id></citation></ref>
<ref id="ref28"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Namgay</surname><given-names>P.</given-names></name> <name><surname>Wangdi</surname><given-names>P.</given-names></name> <name><surname>Thinley</surname><given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Designing and developing a data science programme in Bhutan</article-title> in <conf-name>2022 IEEE Frontiers in education conference (FIE)</conf-name> (<publisher-loc>Sweden</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation></ref>
<ref id="ref29"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll4">North-West University</collab></person-group> (<year>2025a</year>) Faculty of Natural and Agricultural Sciences undergraduate yearbook. Available online at: <ext-link xlink:href="https://studies.nwu.ac.za/studies/yearbooks" ext-link-type="uri">https://studies.nwu.ac.za/studies/yearbooks</ext-link> (Accessed July 22, 2025).</citation></ref>
<ref id="ref30"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll5">North-West University</collab></person-group>. (<year>2025b</year>). The NWU rankings. Available online at: <ext-link xlink:href="https://www.nwu.ac.za/rankings" ext-link-type="uri">https://www.nwu.ac.za/rankings</ext-link> (Accessed July 22, 2025).</citation></ref>
<ref id="ref31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ortiz-Lozano</surname><given-names>J. M.</given-names></name> <name><surname>Rua-Vieites</surname><given-names>A.</given-names></name> <name><surname>Bilbao-Calabuig</surname><given-names>P.</given-names></name> <name><surname>Casades&#x00FA;s-Fa</surname><given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>University student retention: best time and data to identify undergraduate students at-risk of dropout</article-title>. <source>Innov. Educ. Teach. Int.</source> <volume>57</volume>, <fpage>74</fpage>&#x2013;<lpage>85</lpage>. doi: <pub-id pub-id-type="doi">10.1080/14703297.2018.1502090</pub-id></citation></ref>
<ref id="ref32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pek</surname><given-names>R. Z.</given-names></name> <name><surname>&#x00D6;zyer</surname><given-names>S. T.</given-names></name> <name><surname>Elhage</surname><given-names>T.</given-names></name> <name><surname>&#x00D6;zyer</surname><given-names>T.</given-names></name> <name><surname>Alhajj</surname><given-names>R.</given-names></name></person-group> (<year>2022</year>). <article-title>The role of machine learning in identifying students at-risk and minimizing failure</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>1224</fpage>&#x2013;<lpage>1243</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3232984</pub-id></citation></ref>
<ref id="ref33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>P&#x00E9;rez</surname><given-names>L. X.</given-names></name></person-group> (<year>1998</year>). <article-title>Sorting, supporting, connecting, and transforming: intervention strategies for students at risk</article-title>. <source>Community Coll. Rev.</source> <volume>26</volume>, <fpage>63</fpage>&#x2013;<lpage>78</lpage>. doi: <pub-id pub-id-type="doi">10.1177/009155219802600105</pub-id></citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pillay</surname><given-names>A. L.</given-names></name> <name><surname>Ngcobo</surname><given-names>H. S.</given-names></name></person-group> (<year>2010</year>). <article-title>Sources of stress and support among rural-based first-year university students: an exploratory study</article-title>. <source>S. Afr. J. Psychol.</source> <volume>40</volume>, <fpage>234</fpage>&#x2013;<lpage>240</lpage>. doi: <pub-id pub-id-type="doi">10.1177/008124631004000302</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raines</surname><given-names>J. M.</given-names></name></person-group> (<year>2012</year>). <article-title>FirstSTEP: a preliminary review of the effects of a summer bridge program on pre-college STEM majors</article-title>. <source>J. STEM Educ.</source> <volume>13</volume>, <fpage>22</fpage>&#x2013;<lpage>29</lpage>. Available online at: <ext-link xlink:href="https://www.jstem.org/jstem/index.php/JSTEM/article/view/1682" ext-link-type="uri">https://www.jstem.org/jstem/index.php/JSTEM/article/view/1682</ext-link></citation></ref>
<ref id="ref36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sarra</surname><given-names>A.</given-names></name> <name><surname>Fontanella</surname><given-names>L.</given-names></name> <name><surname>Di Zio</surname><given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>Identifying students at-risk of academic failure within the educational data mining framework</article-title>. <source>Soc. Indic. Res.</source> <volume>146</volume>, <fpage>41</fpage>&#x2013;<lpage>60</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11205-018-1901-8</pub-id></citation></ref>
<ref id="ref37"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ssempebwa</surname><given-names>J.</given-names></name> <name><surname>Eduan</surname><given-names>W.</given-names></name> <name><surname>Mulumba</surname><given-names>F. N.</given-names></name></person-group> (<year>2012</year>). <article-title>Effectiveness of university bridging programs in preparing students for university education: a case from East Africa</article-title>. <source>J. Stud. Int. Educ.</source> <volume>16</volume>, <fpage>140</fpage>&#x2013;<lpage>156</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1028315311405062</pub-id></citation></ref>
<ref id="ref38"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tishkovskaya</surname><given-names>S.</given-names></name> <name><surname>Lancaster</surname><given-names>G. A.</given-names></name></person-group> (<year>2012</year>). <article-title>Statistics education in the 21st century: a review of challenges, teaching innovations and strategies for reform</article-title>. <source>J. Stat. Educ.</source> <volume>20</volume>:<fpage>641</fpage>. doi: <pub-id pub-id-type="doi">10.1080/10691898.2012.11889641</pub-id></citation></ref>
<ref id="ref39"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll6">United States Bureau of Labor Statistics</collab></person-group>. (<year>2023</year>). Occupational outlook handbook &#x2013; data scientists. Available online at: <ext-link xlink:href="https://www.bls.gov/ooh/math/data-scientists.htm#tab-1" ext-link-type="uri">https://www.bls.gov/ooh/math/data-scientists.htm#tab-1</ext-link> (Accessed August 20, 2024).</citation></ref>
<ref id="ref40"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Veerasamy</surname><given-names>A. K.</given-names></name> <name><surname>D&#x2019;Souza</surname><given-names>D.</given-names></name> <name><surname>Apiola</surname><given-names>M. V.</given-names></name> <name><surname>Laakso</surname><given-names>M. J.</given-names></name> <name><surname>Salakoski</surname><given-names>T.</given-names></name></person-group> (<year>2020</year>). <article-title>Using early assessment performance as early warning signs to identify at-risk students in programming courses</article-title> in <conf-name>2020 IEEE Frontiers in education conference (FIE)</conf-name> (<publisher-loc>Sweden</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation></ref>
<ref id="ref41"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Voulo</surname><given-names>M.</given-names></name> <name><surname>Evans</surname><given-names>B.</given-names></name> <name><surname>Hannon</surname><given-names>G.</given-names></name> <name><surname>Longenbach</surname><given-names>S.</given-names></name> <name><surname>Spaen</surname><given-names>B.</given-names></name></person-group> (<year>2024</year>). Data science degree programs. Available online at: <ext-link xlink:href="https://www.datascienceprograms.org/" ext-link-type="uri">https://www.datascienceprograms.org/</ext-link> (Accessed August 20, 2024).</citation></ref>
<ref id="ref42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zakari</surname><given-names>I. S.</given-names></name></person-group> (<year>2020</year>). <article-title>Promoting statistics in the era of data science and data-driven innovations</article-title>. <source>Stat. Educ. Res. J.</source> <volume>19</volume>, <fpage>226</fpage>&#x2013;<lpage>237</lpage>. doi: <pub-id pub-id-type="doi">10.52041/serj.v19i1.132</pub-id></citation></ref>
</ref-list>
</back>
</article>