<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioeng. Biotechnol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Bioengineering and Biotechnology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioeng. Biotechnol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-4185</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1631910</article-id>
<article-id pub-id-type="doi">10.3389/fbioe.2025.1631910</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep multimodal biomechanical analysis for lower back pain rehabilitation to improve patients stability</article-title>
<alt-title alt-title-type="left-running-head">Ashraf et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbioe.2025.1631910">10.3389/fbioe.2025.1631910</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Ashraf</surname>
<given-names>Muhammad Abrar</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3187762"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Wu</surname>
<given-names>Yanfeng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Najam</surname>
<given-names>Shaheryar</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3160704"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Alshehri</surname>
<given-names>Mohammed</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>AlQahtani</surname>
<given-names>Yahya</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Aljuaid</surname>
<given-names>Hanan</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name>
<surname>Jalal</surname>
<given-names>Ahmad</given-names>
</name>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<xref ref-type="aff" rid="aff8">
<sup>8</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2629360"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name>
<surname>Liu</surname>
<given-names>Hui</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1257413"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Guodian Nanjing Automation Co., Ltd</institution>, <city>Nanjing</city>, <country country="CN">China</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Department of Electrical and Computer Engineering, Riphah International University</institution>, <city>Islamabad</city>, <country country="PK">Pakistan</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>Department of Electrical Engineering, Bahria University</institution>, <city>Islamabad</city>, <country country="PK">Pakistan</country>
</aff>
<aff id="aff4">
<label>4</label>
<institution>Department of Computer Science, King Khalid University</institution>, <city>Abha</city>, <country country="SA">Saudi Arabia</country>
</aff>
<aff id="aff5">
<label>5</label>
<institution>Department of Informatics and Computer Systems, King Khalid University</institution>, <city>Abha</city>, <country country="SA">Saudi Arabia</country>
</aff>
<aff id="aff6">
<label>6</label>
<institution>Computer Sciences Department, College of Computer and Information Sciences, Princess Nourah bint Abdulrahman University (PNU)</institution>, <city>Riyadh</city>, <country country="SA">Saudi Arabia</country>
</aff>
<aff id="aff7">
<label>7</label>
<institution>Faculty of Computer Science, Air University</institution>, <city>Islamabad</city>, <country country="PK">Pakistan</country>
</aff>
<aff id="aff8">
<label>8</label>
<institution>Department of Computer Science and Engineering, College of Informatics, Korea University</institution>, <city>Seoul</city>, <country country="KR">Republic of Korea</country>
</aff>
<aff id="aff9">
<label>9</label>
<institution>School of Future Technology, Nanjing University of Information Science and Technology</institution>, <city>Nanjing</city>, <country country="CN">China</country>
</aff>
<aff id="aff10">
<label>10</label>
<institution>Cognitive Systems Lab, University of Bremen</institution>, <city>Bremen</city>, <country country="DE">Germany</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Hui Liu, <email xlink:href="hui.liu@uni-bremen.de">hui.liu@uni-bremen.de</email>; Ahmad Jalal, <email xlink:href="ahmadjalal@mail.au.edu.pk">ahmadjalal@mail.au.edu.pk</email>
</corresp>
<fn fn-type="equal" id="fn001">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-11-07">
<day>07</day>
<month>11</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1631910</elocation-id>
<history>
<date date-type="received">
<day>20</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>17</day>
<month>09</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Ashraf, Wu, Najam, Alshehri, AlQahtani, Aljuaid, Jalal and Liu.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Ashraf, Wu, Najam, Alshehri, AlQahtani, Aljuaid, Jalal and Liu</copyright-holder>
<license>
<ali:license_ref start_date="2025-11-07">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Advancements in artificial intelligence are transforming rehabilitation by enabling scalable, patient-centric solutions within modern healthcare systems. This study introduces 3D-PoseFormer, a deep multimodal framework designed for the telerehabilitation of individuals with lower back pain (LBP).</p>
</sec>
<sec>
<title>Methods</title>
<p>The proposed system performs automated data acquisition using synchronized RGB and depth video streams to enable real-time, markerless, and sensor-free analysis of physiotherapy exercises. From the depth sensing module, 3D body joint positions are extracted and used to generate SMPL-based mesh vertices for detailed biomechanical analysis and postural representation. Simultaneously, RGB frames are processed using keypoint detection algorithms&#x2014;Shi-Tomasi, AKAZE, BRISK, SIFT, and Harris corner detection. Extracted features are enhanced through semantic contour analysis of segmented body parts to capture localized appearance-based information relevant to LBP therapy. The fused multimodal features are then passed to a Transformer-based machine learning model that captures temporal motion patterns for accurate exercise classification and human intention recognition.</p>
</sec>
<sec>
<title>Results</title>
<p>The system removes the need for wearable sensors and supports autonomous, continuous monitoring in home-based rehabilitation. Validation on the KIMORE dataset (baseline, including rehabilitation exercises by patients with lower back pain), mRI dataset (rehabilitation exercises), and UTKinect-Action3D dataset (comprising diverse subjects and activity scenarios) achieved state-of-the-art accuracies of 94.73%, 91%, and 94.2%, respectively.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Results demonstrate the robustness, generalizability, and clinical potential of 3D-PoseFormer in AI-assisted rehabilitation, offering a scalable and intelligent healthcare system for remote physiotherapy and patient monitoring.</p>
</sec>
</abstract>
<kwd-group>
<kwd>rehabilitation</kwd>
<kwd>data acquisition</kwd>
<kwd>depth sensing</kwd>
<kwd>biomechanical analysis</kwd>
<kwd>machine learning</kwd>
<kwd>intention recognition</kwd>
<kwd>healthcare system</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. The APC was funded by the Open Access Initiative of the University of Bremen and the DFG via SuUB Bremen. This work was supported through Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2025R54), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia. The authors extend their appreciation to the Deanship of Research and Graduate Studies at King Khalid University for funding this work through Large Group Project under grant number (RGP2/595/46).</funding-statement>
</funding-group>
<counts>
<fig-count count="14"/>
<table-count count="12"/>
<equation-count count="38"/>
<ref-count count="55"/>
<page-count count="21"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Biosensors and Biomolecular Electronics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Lower back pain (LBP) is a prevalent musculoskeletal disorder that affects spinal posture, mobility, and quality of life. Rehabilitation for LBP often demands sustained physiotherapy involving repetitive, supervised exercises. However, conventional rehabilitation requires regular clinical visits, posing accessibility barriers for individuals in remote, rural, or resource-constrained settings. Moreover, unsupervised home exercises risk incorrect execution, potentially worsening patient outcomes. Furthermore, recent advancements in artificial intelligence and computer vision have enabled promising alternatives for automated rehabilitation. Yet, most image-based systems struggle with real-world challenges such as occlusion, appearance variation, depth ambiguity, and dependency on camera viewpoints (<xref ref-type="bibr" rid="B44">Wang et al., 2011</xref>; <xref ref-type="bibr" rid="B9">Cao et al., 2017</xref>). These limitations hinder accurate biomechanical analysis and compromise the reliability of unsupervised assessment in home environments.</p>
<p>Recent rehabilitation research has also emphasized cross-dimensional multimodal assessment, where visual modalities are fused with physiological electrical signals to enhance clinical reliability. For example, <xref ref-type="bibr" rid="B5">Ao et al. (2023)</xref> employed sEMG-based muscle synergy analysis for gesture recognition, demonstrating the value of electrophysiological cues. Related studies have shown that combining vision with surface EMG improves motor intention decoding (<xref ref-type="bibr" rid="B54">Zhang et al., 2019</xref>) and supports more accurate rehabilitation monitoring (<xref ref-type="bibr" rid="B46">Xia et al., 2020</xref>). While these multimodal approaches show promise for enhanced accuracy, they introduce practical challenges including increased hardware complexity, user compliance issues with wearable sensors, and potential discomfort during extended use. Consequently, vision-only systems remain valuable for applications requiring non-invasive deployment, minimal infrastructure requirements, and broader accessibility across diverse user populations and environmental conditions.</p>
<p>To address these challenges, we propose 3D-PoseFormer, a deep multimodal framework tailored for remote lower back pain rehabilitation. The system leverages RGB and depth video streams to enable real-time physiotherapy analysis without wearable sensors or physical markers. From depth images, we extract 3D joint locations and reconstruct full-body meshes using the Skinned Multi-Person Linear (SMPL) model (<xref ref-type="bibr" rid="B29">Loper et al., 2015</xref>), capturing precise pose and shape parameters. Concurrently, RGB images are processed via classical keypoint detectors (Shi-Tomasi, AKAZE, BRISK, SIFT, Harris) and enhanced through semantic contour extraction to localize anatomical features. These complementary features are fused into a unified representation and passed to a Transformer-based architecture (<xref ref-type="bibr" rid="B43">Vaswani et al., 2017</xref>; <xref ref-type="bibr" rid="B28">Liu et al., 2022</xref>). This approach effectively models temporal dynamics for robust exercise classification and correctness evaluation.</p>
<p>The proposed 3D-PoseFormer directly addresses prior limitations by using depth-based 3D joint reconstruction and mesh modelling to handle occlusion and appearance variation, while multimodal RGB-D inputs with Transformer-based temporal modelling mitigate depth ambiguity and viewpoint dependency. These design choices ensure robust rehabilitation analysis in realistic scenarios. The framework integrates structural body modelling, appearance cues, and temporal context without requiring physical markers and complements clinician supervision.</p>
<p>We evaluate our system on three public datasets&#x2014;KIMORE (<xref ref-type="bibr" rid="B57">Capecci et al., 2019</xref>), mRI, and UTKinect-Action3D&#x2014;achieving state-of-the-art performance (94.73%, 91.0%, and 94.2%, respectively), thereby demonstrating its generalizability and clinical relevance.</p>
<p>The main contributions of this work are as follows:<list list-type="bullet">
<list-item>
<p>We present a novel rehabilitation framework free from external sensors or physical markers, combining depth-based 3D mesh reconstruction with RGB-based 2D keypoint extraction for accurate biomechanical assessment.</p>
</list-item>
<list-item>
<p>We propose DKP-Net-24, a novel keypoint extraction framework for robust 3D keypoint estimation from depth-based silhouettes under varying arm poses. It employs specialized image processing pipelines to adapt dynamically to different body alignments, ensuring reliable motion tracking for rehabilitation assessment.</p>
</list-item>
<list-item>
<p>We propose a unique feature fusion technique that combines 2D and 3D keypoints, integrating 2D appearance features, 3D mesh geometry, and semantic contours into a unified feature vector, coupled with Transformer-based classification.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Literature review</title>
<p>In the domain of exercise assessment and recognition, a wide variety of technologies have been explored. Inertial Measurement Units (IMUs) are one of the most commonly used tools due to their portability and ability to capture fine-grained motion data. <xref ref-type="bibr" rid="B3">&#x015E;ahin (2024)</xref> reviewed wearable technologies in physiotherapy and rehabilitation, highlighting their applications in monitoring movement, sleep, and managing chronic health conditions. Despite the promising results, the need to wear multiple sensors can reduce practicality and user comfort in non-clinical environments.</p>
<p>
<xref ref-type="bibr" rid="B64">Gumaei et al. (2019)</xref> proposed a hybrid deep learning model combining SRUs and GRUs for multimodal wearable sensor&#x2013;based human activity recognition, achieving 99.80% accuracy on the MHEALTH dataset and about 95.70% in 10-fold cross-validation. <xref ref-type="bibr" rid="B10">Chang et al. (2011)</xref> used Kinect for posture coaching, achieving 91.9% accuracy in pose classification and 93.75% in trajectory recognition, but faced issues with low light and cluttered backgrounds. <xref ref-type="bibr" rid="B47">Yang et al. (2012)</xref> achieved 85% accuracy for gait-based exercise recognition with Kinect, but performance dropped when users were occluded or faced away. <xref ref-type="bibr" rid="B56">Barabas et al. (2019)</xref> developed a Kinect-based platform for monitoring elderly movements and detecting falls in indoor settings, achieving approximately 92% accuracy in fall detection.</p>
<p>Recent works have also explored telerehabilitation and intelligent exercise monitoring using multimodal and sensor-based approaches. <xref ref-type="bibr" rid="B6">Ashraf et al. (2025)</xref> proposed a telerehabilitation system for elderly healthcare using physical exercise monitoring, while <xref ref-type="bibr" rid="B7">Awan et al. (2024)</xref> introduced a robust exercise-based telerehabilitation framework tailored for elderly healthcare services. <xref ref-type="bibr" rid="B41">Tayyab and Jalal (2025)</xref> developed a machine learning&#x2013;based system for disabled rehabilitation monitoring and healthcare recognition. Similarly, <xref ref-type="bibr" rid="B1">Akhter et al. (2023)</xref> presented a deep skeleton modeling approach with hybrid hand-crafted cues for exercise recognition. <xref ref-type="bibr" rid="B14">Fatima et al. (2024)</xref> designed a feature extraction strategy combining full-body and geometric features for sports interaction recognition, whereas <xref ref-type="bibr" rid="B68">Nadeem et al. (2020)</xref> applied multidimensional features and a Markov model for accurate physical activity recognition in smart health fitness. <xref ref-type="bibr" rid="B65">Afsar et al. (2023)</xref> employed deep learning models with body-worn sensors for sports activity recognition in exergaming, complemented by studies such as <xref ref-type="bibr" rid="B67">Khan et al. (2025)</xref>, <xref ref-type="bibr" rid="B63">Javeed and Chelloug (2022)</xref>, and <xref ref-type="bibr" rid="B66">Kaynat et al. (2025)</xref>, who applied artificial neural networks, gesture recognition, and dynamic features for immersive fitness and wearable-sensor&#x2013;based exergaming systems. <xref ref-type="bibr" rid="B42">Tayyab et al. (2025)</xref> proposed a hybrid deep learning approach combining key body descriptors for sports activity recognition, while <xref ref-type="bibr" rid="B33">Nazar and Jalal (2025)</xref> developed wearable sensor&#x2013;based activity classification methods for intelligent healthcare monitoring. Furthermore, <xref ref-type="bibr" rid="B21">Kamal et al. (2025)</xref> proposed a holistic pose estimation and dynamic motion analysis framework for telerehabilitation of physically disabled individuals, demonstrating the potential of deep models in clinically relevant rehabilitation systems.</p>
<p>More recently, RGB-based approaches have gained attention due to their non-intrusive and sensorless nature. <xref ref-type="bibr" rid="B16">Gupta et al. (2020)</xref> reviewed various RGB video-based human activity recognition models, where several architectures such as 3D CNNs and LSTMs reached 80%&#x2013;85% accuracy on different movement datasets. However, the sensitivity of RGB approaches to illumination changes, camera placement, and background noise remains a significant challenge. <xref ref-type="bibr" rid="B27">Li et al. (2021)</xref> developed an action recognition system using RGB video and graph convolution networks, achieving 82.4% accuracy, but struggled with frame drops and keypoint inaccuracies under occlusion. <xref ref-type="bibr" rid="B61">Aubry et al. (2019)</xref> proposed an action recognition approach using 2D skeletons extracted from RGB videos and CNN-based classification, achieving 83.32% (cross-subject) and 88.78% (cross-view) accuracies on the NTU RGB+D dataset with ResNet. <xref ref-type="bibr" rid="B60">Xu et al. (2021)</xref> developed a dual-stream model integrating scene images with human skeleton data for action recognition, achieving 94.10% accuracy on benchmark datasets. However, real-world robustness was limited.</p>
<p>
<xref ref-type="bibr" rid="B59">Hamdy et al. (2024)</xref> proposed a transformer-based model for classifying rehabilitation exercises, achieving 91.96% accuracy to enhance physical therapy assessment and monitoring. However, performance dropped when joint extraction was inaccurate. Recently, 3D human reconstruction methods, especially those using the SMPL model, have shown promise in exercise assessment, providing a detailed understanding of body movement beyond traditional 2D or depth-based methods. <xref ref-type="bibr" rid="B52">Zanfir et al. (2018)</xref> developed a 3D pose estimation pipeline using SMPL-based reconstruction, achieving 87.2% accuracy in fitness activity analysis. <xref ref-type="bibr" rid="B22">Kanazawa et al. (2018)</xref> introduced Human Mesh Recovery (HMR), using SMPL for 3D pose estimation from a single RGB image, laying the foundation for markerless 3D exercise assessment. <xref ref-type="bibr" rid="B62">Saqlain et al. (2022)</xref> introduced 3DMesh-GAR, a 3D human body mesh-based approach for group activity recognition from RGB frames, achieving 93.6% accuracy on the Collective Activity Dataset.</p>
<p>
<xref ref-type="bibr" rid="B24">Kocabas et al. (2020)</xref> introduced VIBE, which generates temporally coherent SMPL parameters, achieving 86.3% accuracy in action recognition despite motion blur and occlusions. <xref ref-type="bibr" rid="B35">Pavlakos et al. (2017)</xref> used volumetric prediction of 3D meshes for activity recognition, reaching 83.7% accuracy in gesture-based fitness datasets. These studies underline the growing relevance of 3D reconstruction techniques, particularly those involving SMPL, in advancing the field of exercise assessment. By capturing pose and shape in a camera-invariant and rotation-robust format, SMPL opens new avenues for tele-rehabilitation, automated posture correction, and non-intrusive fitness coaching.</p>
</sec>
<sec sec-type="methods" id="s3">
<label>3</label>
<title>Methodology</title>
<p>The methodology has two phases. The first is RGB-Keypoint Detection (RGB-KPD), in which RGB images are processed to estimate the human pose using keypoint detection algorithms. The second is Depth-based Mesh Generation (D-Mesh). In the depth image processing phase, the proposed system extracts human silhouettes and detects 3D body joint positions from depth images. These are then passed to the SMPL model to generate detailed 3D body mesh vertices, along with pose and shape parameters. In the RGB-KPD phase, RGB images are processed to extract complementary visual features. Silhouettes are analyzed using multiple keypoint detection techniques, including Shi-Tomasi, AKAZE, BRISK, SIFT, Harris corner, and contour-based analysis. Body part parsing is performed using a pre-trained model, and contour analysis is applied to each segmented part. The features from the RGB and depth streams are fused, and a Transformer-based architecture is used to capture temporal dynamics and assess exercise quality. The workflow is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>3D-PoseFormer: a deep multimodal pipeline for telerehabilitation of disabled patients via 3D body mesh workflow and part-aware keypoint estimation.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g001.tif">
<alt-text content-type="machine-generated">A comprehensive flowchart illustrates the process of exercise recognition using image and depth data. Starting with RGB and depth frames undergoing preprocessing like resizing and normalization, it includes human silhouette extraction through semantic segmentation. Key point extraction employs various methods, leading to body part labeling. Depth-based semantic segmentation is used for 3D body joint points extraction, followed by 3D mesh reconstruction involving data transformation and joints prediction. The diagram concludes with feature fusion and transformer model integration for exercise recognition. Various visual elements including segmented silhouettes and body joint models enhance the explanation.</alt-text>
</graphic>
</fig>
<sec id="s3-1">
<label>3.1</label>
<title>RGB-KPD</title>
<p>The methodology for extracting human silhouettes from video frames involves three core stages: preprocessing, semantic segmentation, and silhouette extraction. These operations are applied on a per-frame basis, assuming that frames have already been extracted from the input videos.</p>
<sec id="s3-1-1">
<label>3.1.1</label>
<title>Preprocessing</title>
<p>Each input frame, denoted as <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> are pixel coordinates, is initially converted from BGR to RGB color space. It is then resized to match the input dimensions expected by the semantic segmentation model, represented as <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. The resized image is normalized using the mean and standard deviation statistics of the model&#x2019;s training dataset. Mathematically, this transformation can be written as in <xref ref-type="disp-formula" rid="e1">Equation 1</xref>.<disp-formula id="e1">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where I represents the input frame, (x, y) are the pixel coordinates in the resized image of dimensions H &#xd7; W, <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> are the corresponding pixel coordinates in the original BGR image after resizing to H &#xd7; W, <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the RGB color value at pixel (x&#x2032;,y&#x2032;) after the BGR to RGB conversion and resizing, &#x3bc; represents the mean vector (or scalar if it&#x27;s a per-channel mean) calculated from the model&#x2019;s training dataset and &#x3c3; represents the standard deviation vector (or scalar if it&#x27;s a per-channel standard deviation) calculated from the model&#x2019;s training dataset. This normalized image is then converted into a tensor as in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>.<disp-formula id="e2">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>z</mml:mi>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
</sec>
<sec id="s3-1-2">
<label>3.1.2</label>
<title>Semantic segmentation</title>
<p>The tensor <inline-formula id="inf6">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is passed through a DeepLabV3 (<xref ref-type="bibr" rid="B17">Hamamoto et al., 2024</xref>) segmentation model with a ResNet-101 (<xref ref-type="bibr" rid="B34">Panigrahi et al., 2024</xref>) backbone to produce a pixel-wise segmentation map. The model outputs a probability distribution for each pixel given in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>.<disp-formula id="e3">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>V</mml:mi>
<mml:mn>3</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>To assign a class label to each pixel, the class with the highest probability is selected using the argmax operation given by <xref ref-type="disp-formula" rid="e4">Equation 4</xref>.<disp-formula id="e4">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>arg</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where arg max selects the index n of the highest probability in the vector <inline-formula id="inf7">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> corresponding to the class label assigned to that pixel. Following this, a binary mask <inline-formula id="inf8">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is generated by analyzing all regions in <inline-formula id="inf9">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Among all the segmented regions, only the largest connected component is retained, ensuring that the most prominent human figure in the frame is selected using <xref ref-type="disp-formula" rid="e5">Equation 5</xref>.<disp-formula id="e5">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>255</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
</sec>
<sec id="s3-1-3">
<label>3.1.3</label>
<title>Silhouette extraction</title>
<p>To refine the extracted mask, morphological operations are applied to <inline-formula id="inf10">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Specifically, opening and closing operations are performed using a kernel <inline-formula id="inf11">
<mml:math id="m16">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of size <inline-formula id="inf12">
<mml:math id="m17">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to remove small artifacts and fill small holes given by <xref ref-type="disp-formula" rid="e6">Equation 6</xref>.<disp-formula id="e6">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2218;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2022;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2296;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2295;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2295;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2296;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf13">
<mml:math id="m19">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the structuring element, &#x2218; and &#x2022; denote morphological opening and closing, and &#x2296; and &#x2295; denote erosion and dilation, respectively. The cleaned mask is then resized back to the original frame dimensions <inline-formula id="inf14">
<mml:math id="m20">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> for accurate alignment using <xref ref-type="disp-formula" rid="e7">Equation 7</xref>.<disp-formula id="e7">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x22C5;</mml:mo>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mi>W</mml:mi>
</mml:mfrac>
<mml:mo>&#x230b;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x230a;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x22C5;</mml:mo>
<mml:msup>
<mml:mi>H</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
<mml:mi>H</mml:mi>
</mml:mfrac>
<mml:mo>&#x230b;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where &#x230a;&#x22c5;&#x230b; denotes the floor function (used in nearest-neighbor interpolation), W&#x2032; and H&#x2032; are the width and height of the cleaned mask, and x &#x2208; [0, W&#x2212;1], y &#x2208; [0, H&#x2212;1]. Finally, the silhouette frame is generated by applying this mask to the original input frame. Only pixels corresponding to the detected human are retained, while all other pixels are set to zero (black background). The final silhouette frame <inline-formula id="inf15">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is computed as in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>.<disp-formula id="e8">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>255</mml:mn>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>This approach ensures the robust isolation of the human figure from each frame, producing clean silhouettes suitable for downstream analysis.</p>
</sec>
<sec id="s3-1-4">
<label>3.1.4</label>
<title>2D keypoints feature extraction</title>
<p>To extract meaningful structural keypoints from binary human silhouettes, we employed a suite of classical keypoint detection techniques rooted in image geometry and intensity discontinuity. Each method targets distinct properties of the silhouette and collectively offers a diverse spatial representation of the human form across varying poses.</p>
<sec id="s3-1-4-1">
<label>3.1.4.1</label>
<title>Contour-based keypoints</title>
<p>Contour approximation detects the outer boundary of a shape and simplifies it into a polygonal representation. As illustrated in <xref ref-type="fig" rid="F2">Figure 2a</xref>, this method localizes keypoints along the silhouette&#x2019;s perimeter, concentrating on high-curvature regions such as elbows, knees, and shoulder angles. By adjusting the approximation tolerance, the method effectively balances geometric precision and sparsity, resulting in a reduced set of anatomically relevant points. The polygonal simplification is governed by the Douglas&#x2013;Peucker algorithm (<xref ref-type="bibr" rid="B13">Douglas and Peucker, 1973</xref>), which recursively removes points where the perpendicular distance <inline-formula id="inf16">
<mml:math id="m24">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x22a5;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> to the baseline segment is below a specified threshold &#x3b5; using <xref ref-type="disp-formula" rid="e9">Equation 9</xref>.<disp-formula id="e9">
<mml:math id="m25">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x22a5;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Keypoint detection on silhouette images using <bold>(a)</bold> Contour approximation, <bold>(b)</bold> Harris corner detection, and <bold>(c)</bold> Shi-Tomasi method.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g002.tif">
<alt-text content-type="machine-generated">Three images show silhouette outlines of humanoid figures with highlighted points. (a) and (b) have red points marking the contour, while (c) has green points. Each figure is set against a black background.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-1-4-2">
<label>3.1.4.2</label>
<title>Harris corner detection</title>
<p>Harris corner detection identifies regions with strong local intensity variations, mathematically characterized by eigenvalue analysis of the gradient covariance matrix (<xref ref-type="bibr" rid="B18">Harris and Stephens, 1988</xref>). In silhouette imagery, these variations are caused by shape discontinuities. As shown in <xref ref-type="fig" rid="F2">Figure 2b</xref>, Harris keypoints are densely distributed around joint regions, including wrists, ankles, and neck transitions, where the silhouette contour exhibits abrupt directional changes. The corner response function is defined as in <xref ref-type="disp-formula" rid="e10">Equation 10</xref>.<disp-formula id="e10">
<mml:math id="m26">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>det</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x22C5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf17">
<mml:math id="m27">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the second-moment matrix given in <xref ref-type="disp-formula" rid="e11">Equation 11</xref>.<disp-formula id="e11">
<mml:math id="m28">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mtable columnalign="center">
<mml:mtr>
<mml:mtd>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>and <inline-formula id="inf18">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are image gradients along the x and y directions, respectively.</p>
</sec>
<sec id="s3-1-4-3">
<label>3.1.4.3</label>
<title>Shi-Tomasi corner detection</title>
<p>Shi-Tomasi enhances the Harris approach by retaining only those points with the highest structural stability, as defined by the minimum eigenvalue of the autocorrelation matrix (<xref ref-type="bibr" rid="B39">Shi and Tomasi, 1994</xref>). As visualized in <xref ref-type="fig" rid="F2">Figure 2c</xref>, this results in spatially clean and anatomically consistent points located primarily at prominent body joints.</p>
</sec>
<sec id="s3-1-4-4">
<label>3.1.4.4</label>
<title>ORB-based detection</title>
<p>Oriented FAST and Rotated BRIEF (ORB) utilizes intensity difference testing over a circular neighborhood to identify stable keypoints (<xref ref-type="bibr" rid="B37">Rublee et al., 2011</xref>). As seen in <xref ref-type="fig" rid="F3">Figure 3a</xref>, the detected points consistently emerge at limb extremities, head contours, and joint areas. ORB is particularly effective at capturing repeated spatial patterns across multiple poses, making it well-suited for silhouette-based action analysis.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Keypoint detection results using <bold>(a)</bold> ORB, <bold>(b)</bold> BRISK, <bold>(c)</bold> SIFT, and <bold>(d)</bold> AKAZE on binary human silhouettes.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g003.tif">
<alt-text content-type="machine-generated">Silhouettes of two figures with different dot configurations are shown in four panels, labeled (a) to (d). Panels (a) and (b) feature blue and yellow dots, while panels (c) and (d) show yellow and orange dots, illustrating variations in dot density and arrangement around the figures.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-1-4-5">
<label>3.1.4.5</label>
<title>BRISK-based detection</title>
<p>Binary Robust Invariant Scalable Keypoints (BRISK) identifies local extrema by comparing intensity patterns across concentric circular layers (<xref ref-type="bibr" rid="B26">Leutenegger et al., 2011</xref>). <xref ref-type="fig" rid="F3">Figure 3b</xref> demonstrates that BRISK effectively highlights joint-like structures and pose-specific inflection points such as raised hands, bent arms, and inclined postures. The circular sampling design contributes to its ability to adapt to shape deformation and body articulation.</p>
</sec>
<sec id="s3-1-4-6">
<label>3.1.4.6</label>
<title>SIFT-based detection</title>
<p>SIFT (Scale-Invariant Feature Transform) identifies keypoints by locating extrema in scale-normalized Difference-of-Gaussian space (<xref ref-type="bibr" rid="B30">Lowe, 2004</xref>). Despite the lack of texture in silhouette images, the method succeeds in capturing stable points at scale-consistent curvature zones. As seen in <xref ref-type="fig" rid="F3">Figure 3c</xref>, SIFT keypoints predominantly lie along the outer edges, providing a compact yet descriptive summary of the silhouette geometry. The scale-space extrema are located by solving <xref ref-type="disp-formula" rid="e12">Equation 12</xref>.<disp-formula id="e12">
<mml:math id="m30">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>where <inline-formula id="inf19">
<mml:math id="m31">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf20">
<mml:math id="m32">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the Gaussian-blurred image at scale <inline-formula id="inf21">
<mml:math id="m33">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3-1-4-7">
<label>3.1.4.7</label>
<title>AKAZE-based detection</title>
<p>Accelerated KAZE (AKAZE) operates in nonlinear scale space and extracts robust keypoints even under low contrast (<xref ref-type="bibr" rid="B2">Alcantarilla et al., 2011</xref>). Its performance on silhouette data is illustrated in <xref ref-type="fig" rid="F3">Figure 3d</xref>, where keypoints are clustered around the torso and limbs. The method adapts well to body articulation and provides enhanced sensitivity to localized structural transitions.</p>
</sec>
<sec id="s3-1-4-8">
<label>3.1.4.8</label>
<title>Skeleton-based landmark detection</title>
<p>To extract topological keypoints, we applied skeletonization to reduce each silhouette to its medial axis (<xref ref-type="bibr" rid="B53">Zhang and Suen, 1984</xref>). Endpoints and branch points were identified by analyzing the neighborhood connectivity of skeletal pixels. As shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, this method reliably identifies semantically meaningful regions such as fingertips, feet, and limb-torso junctions, offering a structural representation aligned with human pose semantics.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Skeleton-based keypoints showing endpoints (green) and branch points (red) on human silhouettes.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g004.tif">
<alt-text content-type="machine-generated">Two stick figure diagrams with red joints and green extremities. The left figure has arms spread horizontally, while the right figure has arms raised diagonally. Both have a similar lower body structure.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s3-1-5">
<label>3.1.5</label>
<title>Body part labelling</title>
<p>To derive a semantically rich, region-specific understanding of the human body, we employed a body part labeling (BPL) approach using the Single-Human-Parsing-LIP (<xref ref-type="bibr" rid="B19">Huang and Yang, 2024</xref>) model proposed by Huang et al. This model, based on a deep convolutional encoder-decoder framework, performs dense pixel-wise classification across twenty predefined body parts including limbs, torso, and accessories. Due to its relatively lightweight architecture and efficient inference capability, it serves as a computationally economical solution well-suited for large-scale or resource-constrained deployments.</p>
<p>The model was applied on preprocessed silhouette frames to generate multi-class segmentation masks where each pixel is mapped to a corresponding anatomical region. Specifically, the model produces a per-pixel probability distribution <inline-formula id="inf22">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msup>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the final label map <inline-formula id="inf23">
<mml:math id="m35">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is obtained by <xref ref-type="disp-formula" rid="e13">Equation 13</xref>.<disp-formula id="e13">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mi>argmax</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:munder>
<mml:msup>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>where <inline-formula id="inf24">
<mml:math id="m37">
<mml:mrow>
<mml:msup>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the predicted probability of class <italic>c</italic> at pixel <inline-formula id="inf25">
<mml:math id="m38">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. As shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, each segmented region is color-coded for visual clarity, facilitating subsequent part-wise analysis. We developed a color-guided contour extraction method using the semantic label map from the parsing model. Each body part was isolated with its unique color, followed by binary masking and intensity thresholding for boundary extraction. Contours were traced with a point-based algorithm and visualized with color-coded hexagonal markers to represent anatomical regions, as shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. This approach effectively captures geometric structures aligned with human anatomy.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Semantic segmentation results using the LIP model for different upper body poses: <bold>(a)</bold> Arms extended at chest level, <bold>(b)</bold> Arms extended at head level, <bold>(c)</bold> Arms raised above the head.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g005.tif">
<alt-text content-type="machine-generated">Three images of a person standing indoors, each showing a blurred face. In the first and second images, the person holds a stick horizontally and raises it above their head. The person wears a red top and black pants. Each image has a corresponding silhouette overlay with segmented body parts in different colors: arms in blue, torso in yellow, and legs in red.</alt-text>
</graphic>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Part-wise contour point visualization across various frames.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g006.tif">
<alt-text content-type="machine-generated">Three abstract human-like figures composed of dots in various colors, on a black background. Each figure&#x27;s body is yellow, legs red, with multicolored dots outlining arms, legs, and head. The figures have different arm positions, suggesting movement.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>D-mesh</title>
<p>In the D-Mesh phase, the methodology for processing depth images involves several stages: preprocessing and Dynamic KeyPoint Network (DKP-Net) for keypoint extraction. Preprocessing enhances the image by removing noise, the floor, and improving contrast. DKP-Net extracts 3D keypoints, capturing x, y coordinates, and z-depth. The 3D body joint positions extracted are passed to the Skinned Multi-Person Linear (SMPL) model to generate detailed 3D body mesh vertices, along with pose and shape parameters. DKP-Net uses tailored pipelines (DKP-Net-24-L and DKP-Net-24-R) for different arm positions.</p>
<sec id="s3-2-1">
<label>3.2.1</label>
<title>Preprocessing</title>
<p>This study employs the RANSAC algorithm to effectively remove the floor from the depth image by fitting a plane model to the detected floor points. The process starts by identifying floor pixels based on their depth values and using a binary mask to ensure that only foreground pixels are considered. The depth values are then analyzed to generate a set of 3D points representing the floor, as defined in <xref ref-type="disp-formula" rid="e14">Equation 14</xref>.<disp-formula id="e14">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mtext>floor</mml:mtext>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mtext>&#x2009;and&#x2009;</mml:mtext>
<mml:mtext>binary_mask</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>255</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where z represents the depth value, and the binary mask is used to differentiate foreground pixels from background pixels. The RANSAC algorithm is then applied to estimate a planar model that best fits the floor points, as described by <xref ref-type="disp-formula" rid="e15">Equation 15</xref>.<disp-formula id="e15">
<mml:math id="m40">
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>After computing the floor model, points with residuals smaller than a predefined threshold &#x3b5; are identified as floor pixels and removed, as illustrated in <xref ref-type="disp-formula" rid="e16">Equation 16</xref>.<disp-formula id="e16">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mtext>Depth</mml:mtext>
<mml:mtext>corrected</mml:mtext>
</mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;otherwise</mml:mtext>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>This process effectively removes the floor while maintaining the structural integrity of the other depth values in the image, as depicted in <xref ref-type="fig" rid="F7">Figure 7a</xref>. The depth image, with the floor removed, <inline-formula id="inf26">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mi mathvariant="bold-italic">d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, is then normalized to enhance contrast using Min-Max normalization (<xref ref-type="disp-formula" rid="e17">Equation 17</xref>), as shown in <xref ref-type="fig" rid="F7">Figure 7b</xref>.<disp-formula id="e17">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>max</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>255</mml:mn>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Preprocessing steps: <bold>(a)</bold> Floor-removed image, <bold>(b)</bold> Normalization, <bold>(c)</bold> CLAHE, and <bold>(d)</bold> Bilateral Filtering.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g007.tif">
<alt-text content-type="machine-generated">Four grayscale infrared images labeled a, b, c, and d show a person with raised arms standing in a hallway. The room contains a chair and other furniture. The images are similar in content with slight variations in contrast and brightness.</alt-text>
</graphic>
</fig>
<p>To further enhance contrast, we apply CLAHE (Contrast Limited Adaptive Histogram Equalization), which improves local contrast while avoiding excessive noise amplification as shown in <xref ref-type="fig" rid="F7">Figure 7c</xref>. The transformation is described by <xref ref-type="disp-formula" rid="e18">Equation 18</xref>, where <inline-formula id="inf27">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the clipped cumulative distribution function used in CLAHE.<disp-formula id="e18">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
</p>
<p>To make the grayscale depth image compatible with color-based processing techniques, we duplicate the single channel across three channels, as shown in <xref ref-type="disp-formula" rid="e19">Equation 19</xref>.<disp-formula id="e19">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mtext>ch</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
</p>
<p>To preserve edges while minimizing noise, we apply bilateral filtering using <xref ref-type="disp-formula" rid="e20">Equation 20</xref>, as shown in <xref ref-type="fig" rid="F7">Figure 7d</xref>.<disp-formula id="e20">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>where <inline-formula id="inf28">
<mml:math id="m48">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the spatial distance, <italic>r</italic> represents the intensity difference, and <inline-formula id="inf29">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf30">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are Gaussian functions. These preprocessing steps are illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
</sec>
<sec id="s3-2-2">
<label>3.2.2</label>
<title>Deep learning-based human silhouette isolation</title>
<p>To segment objects from depth images, we apply the DeepLabV3&#x2b; segmentation model with a ResNet-101 backbone, as illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>. This model uses Atrous Spatial Pyramid Pooling (ASPP) to capture multi-scale contextual information. The preprocessed image is resized to 256 &#xd7; 256 pixels and then transformed into a tensor, as defined in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>, where <inline-formula id="inf31">
<mml:math id="m51">
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the tensor transformation and <inline-formula id="inf32">
<mml:math id="m52">
<mml:mrow>
<mml:mi mathvariant="script">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the resizing operation given by <xref ref-type="disp-formula" rid="e21">Equation 21</xref>.<disp-formula id="e21">
<mml:math id="m53">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="script">T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="script">R</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>
</p>
<p>The transformed image tensor is input into the DeepLabV3&#x2b; model, which produces a pixel-wise segmentation map, as shown in <xref ref-type="disp-formula" rid="e22">Equation 22</xref>.<disp-formula id="e22">
<mml:math id="m54">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>arg</mml:mi>
<mml:munder>
<mml:mi>max</mml:mi>
<mml:mi>c</mml:mi>
</mml:munder>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
</p>
<p>Morphological operations refine the segmentation mask through resizing, closing, and dilation, ensuring precise segmentation for applications like activity recognition and medical imaging.</p>
</sec>
<sec id="s3-2-3">
<label>3.2.3</label>
<title>Dynamic Keypoint Network &#x2013; 24 points (DKP-Net-24)</title>
<p>DKP-Net-24 (Dynamic Keypoint Network &#x2013; 24 Points) is a robust framework for extracting keypoints from depth-based silhouettes. Unlike static methods, it dynamically adjusts to variations in pose, body alignment, and arm positions, making it ideal for motion tracking and rehabilitation assessment. The system uses two pipelines to extract 3D keypoints (x, y, z) for detailed human motion representation. DKP-Net-24-L handles lowered arms using contour-based analysis, while DKP-Net-24-R is optimized for raised arms, ensuring reliable keypoint detection. The extraction procedure for lowered arms is outlined in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>(DKP-Net-24-L) keypoint detection for lowered arms.</p>
</caption>
<table>
<tbody valign="top">
<tr>
<td align="left">1.&#x2003;procedure MAIN(human_silhouette)</td>
</tr>
<tr>
<td align="left">2.&#x2003;Initialize an empty list for results</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;results &#x2190; []</td>
</tr>
<tr>
<td align="left">3.&#x2003;Set the input silhouette</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;silhouette &#x2190; human_silhouette</td>
</tr>
<tr>
<td align="left">4.&#x2003;Determine the initial position of the head</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;head_x, head_y &#x2190; width/2, top_pixel_y</td>
</tr>
<tr>
<td align="left">5.&#x2003;Set the neck position just below the head</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;neck_x, neck_y &#x2190; width/2, head_y &#x2b; h/8</td>
</tr>
<tr>
<td align="left">6.&#x2003;Find the x-coordinate of the right shoulder at the neck level</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;shoulder_right_x &#x2190; max{x &#x7c; silhouette(neck_y, x) &#x3e; 0} - 5</td>
</tr>
<tr>
<td align="left">7.&#x2003;Find the x-coordinate of the left shoulder at the neck level</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;shoulder_left_x &#x2190; min{x &#x7c; silhouette(neck_y, x) &#x3e; 0} &#x2b; 5</td>
</tr>
<tr>
<td align="left">8.&#x2003;Calculate the center x-coordinate of the neck between the shoulders</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;neck_x &#x2190; avg(shoulder_right_x, shoulder_left_x)</td>
</tr>
<tr>
<td align="left">9.&#x2003;Adjust collarbone positions based on shoulder coordinates</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;collarbone_left_x, collarbone_right_x &#x2190; (shoulder_left_x &#x2b; 15, shoulder_right_x - 15)</td>
</tr>
<tr>
<td align="left">10.&#x2003;Define the starting and ending positions of the hips</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;hip_start, hip_end &#x2190; 3h/5, 2h/3</td>
</tr>
<tr>
<td align="left">11.&#x2003;Find the left and right hip positions</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;hip_left_x, hip_right_x &#x2190; min/max{x &#x7c; silhouette(y, x) &#x3e; 0, y &#x2208; [hip_start, hip_end]}</td>
</tr>
<tr>
<td align="left">12.&#x2003;Calculate the pelvis position</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;pelvis_x, pelvis_y &#x2190; avg(hip_left_x, hip_right_x), avg(hip_start, hip_end)</td>
</tr>
<tr>
<td align="left">13.&#x2003;Determine the center of the spine</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;spine_x, spine_y &#x2190; avg(neck_x, pelvis_x), avg(neck_y, pelvis_y)</td>
</tr>
<tr>
<td align="left">14.&#x2003;Define the upper spine position</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;spine_upper_x, spine_upper_y &#x2190; avg(neck_x, spine_x), avg(neck_y, spine_y)</td>
</tr>
<tr>
<td align="left">15.&#x2003;Define the lower spine position</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;spine_lower_x, spine_lower_y &#x2190; avg(pelvis_x, spine_x), avg(pelvis_y, spine_y)</td>
</tr>
<tr>
<td align="left">16.&#x2003;Find the elbow positions based on the neck and spine range</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;elbow_left_x, elbow_right_x &#x2190; min/max{x &#x7c; silhouette(y, x) &#x3e; 0, y &#x2208; [neck_y, spine_y]}</td>
</tr>
<tr>
<td align="left">17.&#x2003;Find the wrist positions based on the spine and pelvis range</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;wrist_left_x, wrist_right_x &#x2190; min/max{x &#x7c; silhouette(y, x) &#x3e; 0, y &#x2208; [spine_y, pelvis_y]}</td>
</tr>
<tr>
<td align="left">18.&#x2003;Locate the ankle positions at the bottom of the silhouette</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;ankle_left_x, ankle_right_x &#x2190; min/max{x &#x7c; silhouette(bottom_pixel_y, x) &#x3e; 0}</td>
</tr>
<tr>
<td align="left">19.&#x2003;Adjust heel positions slightly lower than the ankles</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;heel_left_y, heel_right_y &#x2190; ankle_left_y - h/40, ankle_right_y - h/40</td>
</tr>
<tr>
<td align="left">20.&#x2003;Display the detected keypoints</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;results &#x2190; DISPLAY_RESULTS(all keypoints)</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;return results</td>
</tr>
<tr>
<td align="left">21.&#x2003;end procedure</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>When the arms are raised above shoulder level, the algorithm described in <xref ref-type="table" rid="T2">Table 2</xref> adapts the keypoint localization process to ensure accurate tracking of the shoulders, wrists, and hands. Detected key body points for different postures are shown in <xref ref-type="fig" rid="F8">Figure 8</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>(DKP-Net-24-R) keypoint detection for raised arms.</p>
</caption>
<table>
<tbody valign="top">
<tr>
<td align="left">1.&#x2003;procedure MAIN(human_silhouette)</td>
</tr>
<tr>
<td align="left">2.&#x2003;Initialize an empty list for results</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;results &#x2190; []</td>
</tr>
<tr>
<td align="left">3.&#x2003;Set the silhouette as input human silhouette</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;silhouette &#x2190; human_silhouette</td>
</tr>
<tr>
<td align="left">4.&#x2003;Define hip start and end positions based on height</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;hip_start, hip_end &#x2190; (3h/5, 2h/3)</td>
</tr>
<tr>
<td align="left">5.&#x2003;Calculate the x-coordinates of the left and right hips</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;hip_left_x, hip_right_x &#x2190; min/max{x &#x7c; silhouette(y, x) &#x3e; 0, y &#x2208; [hip_start, hip_end]}</td>
</tr>
<tr>
<td align="left">6.&#x2003;Calculate the pelvis center coordinates</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;pelvis_x, pelvis_y &#x2190; avg(hip_left_x, hip_right_x), avg(hip_start, hip_end)</td>
</tr>
<tr>
<td align="left">7.&#x2003;Set body center positions for the left and right sides</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;body_center_left, body_center_right &#x2190; hip_left_x, hip_right_x</td>
</tr>
<tr>
<td align="left">8.&#x2003;Find the head position based on the first non-zero pixel above the hips</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;head_x, head_y &#x2190; first nonzero pixel in [0, hip_start]</td>
</tr>
<tr>
<td align="left">9.&#x2003;Calculate the neck position by offsetting the head&#x2019;s position</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;neck_x, neck_y &#x2190; (head_x, head_y &#x2b; h/12)</td>
</tr>
<tr>
<td align="left">10.&#x2003;Identify the x-coordinates of the left and right shoulders at the neck level</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;shoulder_left_x, shoulder_right_x &#x2190; min/max{x &#x7c; silhouette(neck_y, x) &#x3e; 0} &#xb1; 5</td>
</tr>
<tr>
<td align="left">11.&#x2003;Adjust collarbone positions from the shoulders</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;collarbone_left_x, collarbone_right_x &#x2190; (shoulder_left_x &#x2b; 15, shoulder_right_x - 15)</td>
</tr>
<tr>
<td align="left">12.&#x2003;Determine the left and right hand x-coordinates based on silhouette across columns</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;left_hand_x, right_hand_x &#x2190; min/max{x &#x7c; silhouette[:, x].any()}</td>
</tr>
<tr>
<td align="left">13.&#x2003;Set wrist coordinates based on hand positions with a slight vertical offset</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;left_wrist_x, right_wrist_x &#x2190; (left_hand_x, right_hand_x), (left_hand_y &#x2b; 10, right_hand_y &#x2b; 10)</td>
</tr>
<tr>
<td align="left">14.&#x2003;Locate the elbow positions between the neck and spine regions</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;elbow_left_x, elbow_right_x &#x2190; min/max{x &#x7c; silhouette(y, x) &#x3e; 0, y &#x2208; [neck_y, spine_y]}</td>
</tr>
<tr>
<td align="left">15.&#x2003;Calculate the center of the spine by averaging pelvis and neck coordinates</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;spine_x, spine_y &#x2190; avg(neck_x, pelvis_x), avg(neck_y, pelvis_y)</td>
</tr>
<tr>
<td align="left">16.&#x2003;Define the upper spine coordinates between neck and spine centers</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;spine_upper_x, spine_upper_y &#x2190; avg(neck_x, spine_x), avg(neck_y, spine_y)</td>
</tr>
<tr>
<td align="left">17.&#x2003;Locate the knee positions around 3/4 of the height from the top</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;knee_y &#x2190; 3h/4, knee_left_x, knee_right_x &#x2190; min/max{x &#x7c; silhouette(knee_y, x) &#x3e; 0} &#xb1; 7</td>
</tr>
<tr>
<td align="left">18.&#x2003;Find the ankle positions at the bottom of the silhouette</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;ankle_left_x, ankle_right_x &#x2190; min/max{x &#x7c; silhouette(bottom_pixel_y, x) &#x3e; 0}</td>
</tr>
<tr>
<td align="left">19.&#x2003;Adjust heel positions slightly lower than the ankles</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;heel_left_y, heel_right_y &#x2190; ankle_left_y &#x2b; h/30, ankle_right_y &#x2b; h/30</td>
</tr>
<tr>
<td align="left">20.&#x2003;Display all detected keypoints</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;results &#x2190; DISPLAY_RESULTS(all keypoints)</td>
</tr>
<tr>
<td align="left">21.&#x2003;Return the final results</td>
</tr>
<tr>
<td align="left">&#x2003;&#x2003;return results</td>
</tr>
<tr>
<td align="left">22.&#x2003;end procedure</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Detected key body points for different postures: <bold>(a)</bold> Neutral stance, <bold>(b)</bold> One-arm relaxed, and <bold>(c)</bold> Arms raised. When the arms are raised above shoulder level, the algorithm described in <xref ref-type="table" rid="T2">Table 2</xref> adapts the keypoint localization process to ensure accurate tracking of the shoulders, wrists, and hands.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g008.tif">
<alt-text content-type="machine-generated">Three panels depict gray silhouettes with red dots marking key body joints. Panel (a) shows a standing posture with arms down. Panel (b) depicts a slightly different stance with one hand on the hip. Panel (c) shows raised arms.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-2-4">
<label>3.2.4</label>
<title>3D mesh reconstruction</title>
<p>In this work, we developed a pipeline for 3D human mesh reconstruction and SMPL model fitting (<xref ref-type="bibr" rid="B29">Loper et al., 2015</xref>) using multiple motion capture datasets. To ensure compatibility across different skeletal formats, we applied a joint mapping strategy that converts our DKP-Net-24 joint extraction into the SMPL structure. These joints are then used to estimate 3D poses and reconstruct body geometry. The SMPL model provides a learned, parametric mesh representation with 6,890 vertices and 13,776 faces, enabling realistic and efficient modeling for animation and analysis.</p>
<p>This model defines the human body through two parameter sets: pose parameters <inline-formula id="inf33">
<mml:math id="m55">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>72</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which encode 3D axis-angle rotations across 24 joints, and shape parameters <inline-formula id="inf34">
<mml:math id="m56">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>10</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which describe identity-specific body shape variations based on a low-dimensional shape space derived from body scan datasets. The SMPL mesh is computed through a blend function that incorporates these parameters using <xref ref-type="disp-formula" rid="e23">Equation 23</xref>.<disp-formula id="e23">
<mml:math id="m57">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>where <inline-formula id="inf35">
<mml:math id="m58">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the template mesh deformed by shape and pose, <inline-formula id="inf36">
<mml:math id="m59">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents joint locations derived from the shape-dependent skeleton, and <inline-formula id="inf37">
<mml:math id="m60">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the linear blend skinning (LBS) function, applying the rotations defined by &#x3b8; using precomputed weights &#x3c9;. The model outputs three key components: the vertex positions <inline-formula id="inf38">
<mml:math id="m61">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>6890</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> that define the surface geometry, the joint positions <inline-formula id="inf39">
<mml:math id="m62">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>24</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> used for pose tracking, and the face connectivity <inline-formula id="inf40">
<mml:math id="m63">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">N</mml:mi>
<mml:mrow>
<mml:mn>13776</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> which defines the mesh structure. To align with the SMPL model, the 24 joints are structured as (frames, 24, 3) tensors. A depth inversion corrects orientation, and one-to-one mapping ensures anatomical alignment. SMPL fitting minimizes joint loss by reducing the Euclidean distance between predicted and extracted joints using <xref ref-type="disp-formula" rid="e24">Equation 24</xref>.<disp-formula id="e24">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>24</mml:mn>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>&#x3c9;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>where <inline-formula id="inf41">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c9;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are per-joint weights that control the importance of each joint in the loss calculation. To ensure physiologically plausible poses, a probabilistic prior from a Gaussian Mixture Model (GMM) trained on real motion data is used. It penalizes poses that deviate from natural human movement patterns using <xref ref-type="disp-formula" rid="e25">Equation 25</xref>.<disp-formula id="e25">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(25)</label>
</disp-formula>where p(&#x3b8;) is the GMM likelihood. A regularization term penalizes extreme shape values to ensure realistic body proportions using <xref ref-type="disp-formula" rid="e26">Equation 26</xref>.<disp-formula id="e26">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(26)</label>
</disp-formula>
</p>
<p>A smoothness constraint is added to ensure continuity between frames, reducing jitter by penalizing large joint position changes between consecutive frames using <xref ref-type="disp-formula" rid="e27">Equation 27</xref>.<disp-formula id="e27">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
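<mml:msubsup>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>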
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(27)</label>
</disp-formula>where T is the number of frames. Laplacian regularization ensures smooth mesh surfaces by keeping vertices near their neighbors using <xref ref-type="disp-formula" rid="e28">Equation 28</xref>.<disp-formula id="e28">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
</mml:munder>
</mml:mstyle>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(28)</label>
</disp-formula>
</p>
<p>In this expression, <inline-formula id="inf42">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the position of the <italic>i</italic>th vertex and <inline-formula id="inf43">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes its one-ring neighborhood. The overall objective function combines these components, with each term weighted by a corresponding coefficient <inline-formula id="inf44">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to control its influence using <xref ref-type="disp-formula" rid="e29">Equation 29</xref>.<disp-formula id="e29">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>5</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(29)</label>
</disp-formula>
</p>
<p>The loss is minimized with Adam optimization, refining &#x3b8; and &#x3b2; for accurate 3D meshes. <xref ref-type="fig" rid="F9">Figures 9</xref>, <xref ref-type="fig" rid="F10">10</xref> show depth frames and 3D meshes for body poses from the UTKinect-Action3D and KIMORE datasets, respectively.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>3D reconstruction of a subject from the UTKinect-Action3D dataset: <bold>(a)</bold> Random walk toward the chair <bold>(b)</bold> Leaning forward to pick up an object with both hands closing <bold>(c)</bold> Closing of arms <bold>(d)</bold> Opening the arms after the clap.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g009.tif">
<alt-text content-type="machine-generated">Depth images and corresponding 3D models of a person in four poses labeled (a) through (d). The depth images show silhouettes against a dark background, while the 3D models feature a human figure in varied stances, including relaxed, crouched, and arms outstretched, on a blue background.</alt-text>
</graphic>
</fig>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>3D reconstruction of a subject from the KIMORE dataset: <bold>(a)</bold> Holding a bar with both hands at face level <bold>(b)</bold> Moving the bar to the right side while keeping it in both hands <bold>(c)</bold> Bringing the bar back to the front of the face <bold>(d)</bold> Moving the bar to the left side while keeping it in both hands.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g010.tif">
<alt-text content-type="machine-generated">Four depth images on top show a person with arms extended in various orientations. Below, corresponding 3D models of the person in the same postures are displayed against a blue background, labeled (a) through (d).</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Feature fusion</title>
<p>A feature fusion stage combines RGB and depth modalities. Specifically, let <inline-formula id="inf45">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">k</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>P</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represent keypoint features from the <italic>i</italic>th RGB frame, capturing visual cues through methods like contour approximation, corner detection (e.g., Harris, Shi-Tomasi), and feature descriptors (e.g., ORB, SIFT). Furthermore, let <inline-formula id="inf46">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">b</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote body part label features from depth data, encoding segmentation and body region identification. In addition, <inline-formula id="inf47">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>S</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents 3D mesh parameters, including pose and shape, derived from depth data using models like SMPL. The fused feature vector <inline-formula id="inf48">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is computed by concatenating these vectors using <xref ref-type="disp-formula" rid="e30">Equation 30</xref>.<disp-formula id="e30">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">k</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">b</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(30)</label>
</disp-formula>where &#x2295; represents the concatenation operation, resulting in <inline-formula id="inf49">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>Q</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Fusion of RGB and depth enhances robustness to occlusion, clothing, and viewpoint changes. The fused feature vectors <inline-formula id="inf50">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for a sequence of <italic>M</italic> frames are then assembled into a feature matrix <inline-formula id="inf51">
<mml:math id="m81">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi mathvariant="bold-italic">M</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>Q</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which serves as input to the subsequent temporal modeling stage for exercise recognition.</p>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Transformer-based human action recognition</title>
<p>Given a sequence of fused numerical features for human action recognition, each time step <inline-formula id="inf52">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>D</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> encapsulates a combination of modalities such as spatial skeleton data, inertial sensor signals, and appearance features, all merged into a unified vector. This results in an input sequence <inline-formula id="inf53">
<mml:math id="m83">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>T</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where T denotes the number of temporal frames, and each <inline-formula id="inf54">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> carries rich multimodal contextual information. The fused input is projected into a common latent space using a learnable transformation given by <xref ref-type="disp-formula" rid="e31">Equation 31</xref>.<disp-formula id="e31">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(31)</label>
</disp-formula>
</p>
<p>To capture temporal ordering, positional encodings <inline-formula id="inf55">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are added, yielding the input embeddings to the Transformer using <xref ref-type="disp-formula" rid="e32">Equation 32</xref>.<disp-formula id="e32">
<mml:math id="m87">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(32)</label>
</disp-formula>
</p>
<p>The resulting sequence <inline-formula id="inf56">
<mml:math id="m88">
<mml:mrow>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is passed to a stack of Transformer encoder layers, which learn attention-based temporal representations from the fused features. Inside each encoder layer, the Multi-Head Self-Attention mechanism enables the model to weigh interactions between time steps. For each layer, query, key, and value matrices are computed using <xref ref-type="disp-formula" rid="e33">Equation 33</xref>.<disp-formula id="e33">
<mml:math id="m89">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>Z</mml:mi>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>Z</mml:mi>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>Z</mml:mi>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(33)</label>
</disp-formula>
</p>
<p>The scaled dot-product attention computes dynamic temporal dependencies using <xref ref-type="disp-formula" rid="e34">Equation 34</xref>.<disp-formula id="e34">
<mml:math id="m90">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:msqrt>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
<label>(34)</label>
</disp-formula>
</p>
<p>In the multi-head form, multiple such attention mechanisms run in parallel using <xref ref-type="disp-formula" rid="e35">Equation 35</xref>.<disp-formula id="e35">
<mml:math id="m91">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>H</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>O</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(35)</label>
</disp-formula>
</p>
<p>This output is passed through a position-wise feed-forward network using <xref ref-type="disp-formula" rid="e36">Equation 36</xref>.<disp-formula id="e36">
<mml:math id="m92">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="italic">max</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(36)</label>
</disp-formula>
</p>
<p>Residual connections and normalization are applied to preserve gradients and stabilize learning using <xref ref-type="disp-formula" rid="e37">Equation 37</xref>.<disp-formula id="e37">
<mml:math id="m93">
<mml:mrow>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>Z</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>H</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(37)</label>
</disp-formula>
</p>
<p>This pooled vector is then passed into a fully connected classification layer followed by a softmax to predict the action label using <xref ref-type="disp-formula" rid="e38">Equation 38</xref>.<disp-formula id="e38">
<mml:math id="m94">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(38)</label>
</disp-formula>where, <inline-formula id="inf57">
<mml:math id="m95">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>K</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the probability distribution across human action classes. The model uses cross-entropy loss for optimization. The Transformer on fused features enhances accuracy by capturing spatial-temporal dependencies and multimodal complementarity. The workflow is shown in <xref ref-type="fig" rid="F11">Figure 11</xref>, and the algorithm is given in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Illustration of the architecture of a Transformer encoder followed by a classification layer.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g011.tif">
<alt-text content-type="machine-generated">Diagram illustrating a transformer model architecture and its classification layer. On the left, the procedure includes Add &#x26; Normalize, Feed Forward Networks (FFN), Layer Normalization, and Multi-Head Self-Attention (MHSA). Positional encodings are added to inputs. On the right, a neural network structure shows layers: input, hidden, and output, interconnected by green nodes.</alt-text>
</graphic>
</fig>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>3D-PoseFormer: Multimodal RGB-depth exercise recognition via 3D mesh and Transformer.</p>
</caption>
<table>
<tbody valign="top">
<tr>
<td align="left">3D-PoseFormer: Multimodal RGB-Depth Exercise Recognition via 3D Mesh and Transformer</td>
</tr>
<tr>
<td align="left">&#x2003;Input: RGB_image, Depth_image</td>
</tr>
<tr>
<td align="left">&#x2003;Output: exercise_label: Recognized exercise class</td>
</tr>
<tr>
<td align="left">&#x2003;Algorithm</td>
</tr>
<tr>
<td align="left">&#x2003;1. procedure MAIN(RGB_image, Depth_image)</td>
</tr>
<tr>
<td align="left">&#x2003;2.&#x2003;rgb_features &#x2190; PROCESS_RGB(RGB_image)</td>
</tr>
<tr>
<td align="left">&#x2003;3.&#x2003;depth_features &#x2190; PROCESS_DEPTH(Depth_image)</td>
</tr>
<tr>
<td align="left">&#x2003;4.&#x2003;fused_features &#x2190; FUSE_FEATURES(rgb_features, depth_features)</td>
</tr>
<tr>
<td align="left">&#x2003;5.&#x2003;predictions &#x2190; TRANSFORMER_CLASSIFIER(fused_features)</td>
</tr>
<tr>
<td align="left">&#x2003;6.&#x2003;return predictions</td>
</tr>
<tr>
<td align="left">&#x2003;7.&#x2003;end procedure</td>
</tr>
<tr>
<td align="left">&#x2003;8.&#x2003;procedure PROCESS_RGB(image)</td>
</tr>
<tr>
<td align="left">&#x2003;9.&#x2003;preprocessed &#x2190; PREPROCESS_RGB(image)</td>
</tr>
<tr>
<td align="left">&#x2003;10.&#x2003;silhouette &#x2190; SEGMENT_HUMAN(preprocessed)</td>
</tr>
<tr>
<td align="left">&#x2003;11.&#x2003;keypoints &#x2190; EXTRACT_2D_KEYPOINTS(silhouette)</td>
</tr>
<tr>
<td align="left">&#x2003;12.&#x2003;part_labels &#x2190; BODY_PART_PARSING(silhouette)</td>
</tr>
<tr>
<td align="left">&#x2003;13.&#x2003;contour_points &#x2190; EXTRACT_CONTOUR(part_labels)</td>
</tr>
<tr>
<td align="left">&#x2003;14.&#x2003;return CONCAT(keypoints, contour_points)</td>
</tr>
<tr>
<td align="left">&#x2003;15.&#x2003;end procedure</td>
</tr>
<tr>
<td align="left">&#x2003;16.&#x2003;procedure PROCESS_DEPTH(image)</td>
</tr>
<tr>
<td align="left">&#x2003;17.&#x2003;cleaned &#x2190; PREPROCESS_DEPTH(image)</td>
</tr>
<tr>
<td align="left">&#x2003;18.&#x2003;silhouette &#x2190; SEGMENT_HUMAN_DEPTH(cleaned)</td>
</tr>
<tr>
<td align="left">&#x2003;19.&#x2003;keypoints_3D &#x2190; EXTRACT_3D_KEYPOINTS(silhouette)</td>
</tr>
<tr>
<td align="left">&#x2003;20.&#x2003;mesh &#x2190; RECONSTRUCT_3D_MESH(keypoints_3D)</td>
</tr>
<tr>
<td align="left">&#x2003;21.&#x2003;smpl_params &#x2190; FIT_SMPL(mesh, keypoints_3D)</td>
</tr>
<tr>
<td align="left">&#x2003;22.&#x2003;return smpl_params</td>
</tr>
<tr>
<td align="left">&#x2003;23.&#x2003;end procedure</td>
</tr>
<tr>
<td align="left">&#x2003;24.&#x2003;procedure FUSE_FEATURES(rgb_feats, depth_feats)</td>
</tr>
<tr>
<td align="left">&#x2003;25.&#x2003;fused &#x2190; CONCAT(rgb_feats, depth_feats)</td>
</tr>
<tr>
<td align="left">&#x2003;26.&#x2003;return fused</td>
</tr>
<tr>
<td align="left">&#x2003;27.&#x2003;end procedure</td>
</tr>
<tr>
<td align="left">&#x2003;28.&#x2003;procedure TRANSFORMER_CLASSIFIER(features)</td>
</tr>
<tr>
<td align="left">&#x2003;29.&#x2003;embedded &#x2190; EMBED(features)</td>
</tr>
<tr>
<td align="left">&#x2003;30.&#x2003;positional &#x2190; ADD_POSITIONAL_ENCODING(embedded)</td>
</tr>
<tr>
<td align="left">&#x2003;31.&#x2003;for each layer in TRANSFORMER_ENCODER_STACK do</td>
</tr>
<tr>
<td align="left">&#x2003;32.&#x2003;&#x2003;positional &#x2190; TRANSFORMER_ENCODER(positional)</td>
</tr>
<tr>
<td align="left">&#x2003;33.&#x2003;end for</td>
</tr>
<tr>
<td align="left">&#x2003;34.&#x2003;output &#x2190; CLASSIFY(positional)</td>
</tr>
<tr>
<td align="left">&#x2003;35.&#x2003;return output</td>
</tr>
<tr>
<td align="left">&#x2003;36.&#x2003;end procedure</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Results and evaluation</title>
<sec id="s4-1">
<label>4.1</label>
<title>Experimental setup</title>
<p>The complete implementation was conducted on a Google Colab virtual machine with an NVIDIA Tesla T4 GPU with 16&#xa0;GB GDDR6 VRAM, 2,560 CUDA cores, and 320 Tensor cores, running Ubuntu 18.04.6 LTS. The environment used Python 3.10.13 with PyTorch 2.1.0&#x2b;cu118 and cuDNN 8.9.1, enabling GPU-accelerated tensor computations and convolution operations. We used TorchVision 0.16.0 for image processing, NumPy 1.25.0 and Pandas 2.1.1 for data handling, and Matplotlib 3.8.0 and Seaborn 0.12.3 for visualization.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Datasets</title>
<p>We used the KIMORE dataset, which includes data from 78 subjects (44 healthy, 34 with low-back pain) performing five rehabilitation exercises. It provides RGB and depth videos, 25-joint skeleton positions, and clinical scores for each repetition, supporting intelligent remote rehabilitation monitoring. Additionally, we incorporated the mRI dataset, a multi-modal 3D pose estimation resource with over 5 million frames from 20 subjects, captured using RGB-D cameras, mmWave radar, and IMUs. This dataset aims to advance home-based health monitoring. Furthermore, to test generalizability, we selected the UTKinect-Action3D action recognition dataset. This dataset records 10 subjects performing 10 daily-life actions, with synchronized RGB, depth, and skeletal data for generalizable action recognition in physical therapy.</p>
<p>The presented work is motivated by the rehabilitation of patients with lower back pain (LBP), and the KIMORE dataset directly reflects this scenario through rehabilitation-specific exercises performed by both healthy subjects and LBP patients. To complement this, we included the mRI and UTKinect-Action3D datasets to strengthen the generalization of the rehabilitation framework. The mRI dataset contains multi-modal recordings of repetitive and complex movements (e.g., bending, squatting, reaching), which closely resemble the functional motions targeted in LBP rehabilitation. Similarly, the UTKinect-Action3D dataset includes a wide variety of full-body actions that involve spinal mobility and trunk stability (e.g., bending, lifting, side movements), both of which are central components in evaluating rehabilitation progress for lower back disorders. By training and validating our model on these datasets, we ensure that the system is not overfitted to one rehabilitation dataset but can generalize to broader variations of human motion that are highly relevant to lower back rehabilitation tasks.</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Confusion matrices</title>
<p>
<xref ref-type="table" rid="T4">Table 4</xref> shows the confusion matrix for correctness classification on the KIMORE dataset, achieving a 94.73% overall accuracy. Exercises E1 (0.92), E4 (0.97), and E5 (0.97) were classified highly accurately. Minor misclassifications occurred between E2 and E3, likely due to similar movement patterns.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Confusion matrix for correctness classification for KIMORE dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Class</th>
<th align="center">E1</th>
<th align="center">E2</th>
<th align="center">E3</th>
<th align="center">E4</th>
<th align="center">E5</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">E1</td>
<td align="center">0.92</td>
<td align="center">0.03</td>
<td align="center">0.03</td>
<td align="center">0.01</td>
<td align="center">0.02</td>
</tr>
<tr>
<td align="center">E2</td>
<td align="center">0.02</td>
<td align="center">0.93</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.04</td>
</tr>
<tr>
<td align="center">E3</td>
<td align="center">0.02</td>
<td align="center">0.06</td>
<td align="center">0.88</td>
<td align="center">0.02</td>
<td align="center">0.02</td>
</tr>
<tr>
<td align="center">E4</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.97</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E5</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.97</td>
</tr>
<tr>
<td colspan="3" align="center">Accuracy</td>
<td colspan="3" align="center">
<bold>94.73%</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T5">Table 5</xref> presents the confusion matrix for the mRI dataset, with a 91% overall accuracy across 12 exercise classes. Exercises such as E2, E3, E5, E6, E7, E9, E10, E11, and E12 showed excellent recognition (&#x2265;0.86). Some confusion occurred, notably for E1 and E4, due to overlapping execution characteristics. <xref ref-type="table" rid="T6">Table 6</xref> shows results for the UTKinect-Action3D dataset, achieving 94.2% overall accuracy. Actions such as Clap hands (0.98), Wave hands (0.97), Pick up (0.97), and Throw (0.91) were classified with high precision. Minor confusion appeared between motion-similar action pairs, namely Stand up and Walk, and Carry and Pull. Overall, the model demonstrated strong classification performance across all three datasets, with most errors arising from visually or kinematically similar actions.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Confusion matrix for correctness classification for mRI dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Class</th>
<th align="center">E1</th>
<th align="center">E2</th>
<th align="center">E3</th>
<th align="center">E4</th>
<th align="center">E5</th>
<th align="center">E6</th>
<th align="center">E7</th>
<th align="center">E8</th>
<th align="center">E9</th>
<th align="center">E10</th>
<th align="center">E11</th>
<th align="center">E12</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">E1</td>
<td align="center">0.47</td>
<td align="center">0.06</td>
<td align="center">0.00</td>
<td align="center">0.06</td>
<td align="center">0.06</td>
<td align="center">0.12</td>
<td align="center">0.00</td>
<td align="center">0.06</td>
<td align="center">0.00</td>
<td align="center">0.06</td>
<td align="center">0.00</td>
<td align="center">0.12</td>
</tr>
<tr>
<td align="center">E2</td>
<td align="center">0.00</td>
<td align="center">0.86</td>
<td align="center">0.01</td>
<td align="center">0.06</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E3</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.91</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E4</td>
<td align="center">0.00</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
<td align="center">0.67</td>
<td align="center">0.00</td>
<td align="center">0.08</td>
<td align="center">0.04</td>
<td align="center">0.08</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.04</td>
</tr>
<tr>
<td align="center">E5</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.89</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E6</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.92</td>
<td align="center">0.00</td>
<td align="center">0.03</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
</tr>
<tr>
<td align="center">E7</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.03</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.91</td>
<td align="center">0.02</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
</tr>
<tr>
<td align="center">E8</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.04</td>
<td align="center">0.83</td>
<td align="center">0.04</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E9</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.03</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.92</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E10</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.99</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">E11</td>
<td align="center">0.01</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.93</td>
<td align="center">0.02</td>
</tr>
<tr>
<td align="center">E12</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.98</td>
</tr>
<tr>
<td colspan="6" align="center">Accuracy</td>
<td colspan="7" align="center">
<bold>91%</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Confusion matrix for correctness classification for UTKinect-Action3D dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Class</th>
<th align="center">Walk</th>
<th align="center">Sit down</th>
<th align="center">Stand up</th>
<th align="center">Pick up</th>
<th align="center">Carry</th>
<th align="center">Throw</th>
<th align="center">Push</th>
<th align="center">Pull</th>
<th align="center">Wave hands</th>
<th align="center">Clap hands</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Walk</td>
<td align="center">0.90</td>
<td align="center">0.00</td>
<td align="center">0.20</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.05</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">Sit down</td>
<td align="center">0.02</td>
<td align="center">0.89</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.03</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">Stand up</td>
<td align="center">0.02</td>
<td align="center">0.01</td>
<td align="center">0.60</td>
<td align="center">0.01</td>
<td align="center">0.03</td>
<td align="center">0.00</td>
<td align="center">0.03</td>
<td align="center">0.05</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">Pick up</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.10</td>
<td align="center">0.97</td>
<td align="center">0.05</td>
<td align="center">0.02</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">Carry</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.84</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.09</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
</tr>
<tr>
<td align="center">Throw</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.91</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">Push</td>
<td align="center">0.05</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.03</td>
<td align="center">0.02</td>
<td align="center">0.92</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
</tr>
<tr>
<td align="center">Pull</td>
<td align="center">0.00</td>
<td align="center">0.03</td>
<td align="center">0.10</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.82</td>
<td align="center">0.01</td>
<td align="center">0.01</td>
</tr>
<tr>
<td align="center">Wave hands</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.03</td>
<td align="center">0.05</td>
<td align="center">0.01</td>
<td align="center">0.00</td>
<td align="center">0.97</td>
<td align="center">0.00</td>
</tr>
<tr>
<td align="center">Clap hands</td>
<td align="center">0.00</td>
<td align="center">0.02</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.00</td>
<td align="center">0.01</td>
<td align="center">0.98</td>
</tr>
<tr>
<td colspan="6" align="center">Accuracy</td>
<td colspan="5" align="center">
<bold>94.2%</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>Classification performance evaluation</title>
<p>
<xref ref-type="table" rid="T7">Table 7</xref> reports the precision, recall, and F1-score values for correctness classification on the KIMORE dataset. For the KIMORE dataset, the model demonstrated excellent performance across all five exercise classes. The highest scores were achieved for Exercise 5 (E5), with a precision of 0.98, recall of 0.96, and an F1-score of 0.97, followed by Exercise 4 (E4), which recorded consistent values of 0.96 for both precision and recall, resulting in an F1-score of 0.96. While Exercise 1 (E1) also achieved strong results with a precision of 0.93 and recall of 0.95, slightly lower values were observed for Exercise 2 (E2) and Exercise 3 (E3), with F1-scores of 0.89 and 0.87, respectively. These lower values correspond with the confusion matrix findings, where misclassifications between E2 and E3 were noted, highlighting areas where the system occasionally struggles to differentiate similar movement patterns.</p>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Precision, recall, and F1-score results over KIMORE dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Exercises</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">E1</td>
<td align="center">0.93</td>
<td align="center">0.95</td>
<td align="center">0.94</td>
</tr>
<tr>
<td align="center">E2</td>
<td align="center">0.89</td>
<td align="center">0.90</td>
<td align="center">0.89</td>
</tr>
<tr>
<td align="center">E3</td>
<td align="center">0.86</td>
<td align="center">0.87</td>
<td align="center">0.87</td>
</tr>
<tr>
<td align="center">E4</td>
<td align="center">0.96</td>
<td align="center">0.96</td>
<td align="center">0.96</td>
</tr>
<tr>
<td align="center">E5</td>
<td align="center">0.98</td>
<td align="center">0.96</td>
<td align="center">0.97</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T8">Table 8</xref> reports the precision, recall, and F1-score values for the mRI dataset across twelve different exercise classes. The model demonstrated strong and consistent performance on most exercises. Notably, E11 achieved the highest scores with a precision of 0.93, recall of 0.92, and F1-score of 0.93, followed closely by E10 (precision: 0.89, recall: 0.94, F1-score: 0.92) and E3 (precision: 0.91, recall: 0.92, F1-score: 0.91). Exercises E5, E7, E9, and E12 also showed high F1-scores of 0.89, indicating robust classification in these categories. However, E1 exhibited the lowest performance, with a recall of 0.47 and an F1-score of 0.59, suggesting challenges in accurately identifying this exercise. Overall, the model shows promising recognition capability across the dataset, with a few classes like E1 and E4 (F1-score: 0.72) requiring further attention to enhance classification accuracy.</p>
<table-wrap id="T8" position="float">
<label>TABLE 8</label>
<caption>
<p>Precision, recall, and F1-score results over mRI dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Exercises</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">E1</td>
<td align="center">0.82</td>
<td align="center">0.47</td>
<td align="center">0.59</td>
</tr>
<tr>
<td align="center">E2</td>
<td align="center">0.86</td>
<td align="center">0.87</td>
<td align="center">0.86</td>
</tr>
<tr>
<td align="center">E3</td>
<td align="center">0.91</td>
<td align="center">0.92</td>
<td align="center">0.91</td>
</tr>
<tr>
<td align="center">E4</td>
<td align="center">0.76</td>
<td align="center">0.68</td>
<td align="center">0.72</td>
</tr>
<tr>
<td align="center">E5</td>
<td align="center">0.89</td>
<td align="center">0.90</td>
<td align="center">0.89</td>
</tr>
<tr>
<td align="center">E6</td>
<td align="center">0.79</td>
<td align="center">0.93</td>
<td align="center">0.86</td>
</tr>
<tr>
<td align="center">E7</td>
<td align="center">0.88</td>
<td align="center">0.89</td>
<td align="center">0.89</td>
</tr>
<tr>
<td align="center">E8</td>
<td align="center">0.78</td>
<td align="center">0.84</td>
<td align="center">0.81</td>
</tr>
<tr>
<td align="center">E9</td>
<td align="center">0.88</td>
<td align="center">0.90</td>
<td align="center">0.89</td>
</tr>
<tr>
<td align="center">E10</td>
<td align="center">0.89</td>
<td align="center">0.94</td>
<td align="center">0.92</td>
</tr>
<tr>
<td align="center">E11</td>
<td align="center">0.93</td>
<td align="center">0.92</td>
<td align="center">0.93</td>
</tr>
<tr>
<td align="center">E12</td>
<td align="center">0.82</td>
<td align="center">0.98</td>
<td align="center">0.89</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T9">Table 9</xref> reports the precision, recall, and F1-score values for the UTKinect-Action3D dataset. The model performed well across most actions, with Clap hands achieving the highest scores (precision: 0.97, recall: 0.97, F1-score: 0.97), followed by Throw (precision: 0.91, recall: 0.99, F1-score: 0.95) and Pick up (precision: 0.98, recall: 0.84, F1-score: 0.90). Other actions like Carry and Push also showed strong results. However, Stand up had lower performance (precision: 0.60, recall: 0.80, F1-score: 0.69), indicating difficulty in distinguishing it from similar actions. Overall, the model demonstrated strong action recognition, with room for improvement in Stand up classification.</p>
<table-wrap id="T9" position="float">
<label>TABLE 9</label>
<caption>
<p>Precision, recall, and F1-score results over UTKinect-Action3D dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Exercises</th>
<th align="left">Precision</th>
<th align="left">Recall</th>
<th align="left">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Walk</td>
<td align="left">0.91</td>
<td align="left">0.77</td>
<td align="left">0.83</td>
</tr>
<tr>
<td align="left">Sit down</td>
<td align="left">0.89</td>
<td align="left">0.93</td>
<td align="left">0.91</td>
</tr>
<tr>
<td align="left">Stand up</td>
<td align="left">0.60</td>
<td align="left">0.80</td>
<td align="left">0.69</td>
</tr>
<tr>
<td align="left">Pick up</td>
<td align="left">0.98</td>
<td align="left">0.84</td>
<td align="left">0.90</td>
</tr>
<tr>
<td align="left">Carry</td>
<td align="left">0.83</td>
<td align="left">0.87</td>
<td align="left">0.85</td>
</tr>
<tr>
<td align="left">Throw</td>
<td align="left">0.91</td>
<td align="left">0.99</td>
<td align="left">0.95</td>
</tr>
<tr>
<td align="left">Push</td>
<td align="left">0.91</td>
<td align="left">0.88</td>
<td align="left">0.90</td>
</tr>
<tr>
<td align="left">Pull</td>
<td align="left">0.81</td>
<td align="left">0.85</td>
<td align="left">0.83</td>
</tr>
<tr>
<td align="left">Wave hands</td>
<td align="left">0.96</td>
<td align="left">0.90</td>
<td align="left">0.93</td>
</tr>
<tr>
<td align="left">Clap hands</td>
<td align="left">0.97</td>
<td align="left">0.97</td>
<td align="left">0.97</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The model showed strong classification on the KIMORE dataset (<xref ref-type="fig" rid="F12">Figure 12</xref>), with high AUCs across exercises. E5 (0.98), E4 (0.97), and E2 (0.96) had near-perfect discrimination, while E1, though lower, still achieved 0.88. The mean AUC was 0.94, highlighting robust overall performance, with E1 likely being harder to distinguish due to movement similarities. The model showed strong discriminative performance across all exercises (E1&#x2013;E12) in the mRI dataset, as shown by the ROC analysis in <xref ref-type="fig" rid="F13">Figure 13</xref>. Most exercises achieved excellent AUCs, with E12 (0.99), E9&#x2013;E11 (0.98), and E3, E6, and E7 (0.96) performing exceptionally well. E2 (0.93) and E8 (0.91) also maintained high performance. The mean AUC was 0.93, far above random guessing. E1 (0.75) and E4 (0.83) had lower scores, suggesting greater classification challenges due to overlapping kinematics.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>ROC Curves for KIMORE dataset.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g012.tif">
<alt-text content-type="machine-generated">ROC curves for the KIMORE dataset, showing true positive rate versus false positive rate. Curves for models E1 to E5 have AUC values of 0.88, 0.96, 0.93, 0.97, and 0.98, respectively. The mean ROC curve has an AUC of 0.94. A dashed line represents random guessing.</alt-text>
</graphic>
</fig>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>ROC Curves for mRI dataset.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g013.tif">
<alt-text content-type="machine-generated">ROC curves for an mRI dataset display true positive rate versus false positive rate for twelve experiments, labeled E1 to E12, with varying AUC values ranging from 0.75 to 0.99. The mean ROC is 0.93. A grey line represents random guessing.</alt-text>
</graphic>
</fig>
<p>Overall, the model demonstrated robust exercise recognition in the mRI dataset. The model showed strong performance on the UTKinect-Action3D dataset (<xref ref-type="fig" rid="F14">Figure 14</xref>), with most actions achieving high AUCs. &#x201c;Clap hands&#x201d; reached 1.00, &#x201c;Wave hands&#x201d; 0.99, and &#x201c;Sit down,&#x201d; &#x201c;Pick up,&#x201d; and &#x201c;Push&#x201d; all 0.98. &#x201c;Stand up&#x201d; had a lower AUC of 0.79. The mean AUC was 0.95, indicating robust classification with only minor challenges for &#x201c;Stand up.&#x201d;</p>
<fig id="F14" position="float">
<label>FIGURE 14</label>
<caption>
<p>ROC Curves for UTKinect-Action3D dataset.</p>
</caption>
<graphic xlink:href="fbioe-13-1631910-g014.tif">
<alt-text content-type="machine-generated">ROC curves for the UTKinect-Action3D dataset, depicting various actions such as walk, sit down, stand up, pick up, throw, and more. Each curve shows true positive rate versus false positive rate. Area Under Curve (AUC) values range from 0.79 to 1.00, with &#x22;Clap hands&#x22; at 1.00. The mean ROC has an AUC of 0.95.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-5">
<label>4.5</label>
<title>Comparison with state-of-the-art</title>
<p>
<xref ref-type="table" rid="T10">Table 10</xref> compares recent studies on rehabilitation exercise recognition. <xref ref-type="bibr" rid="B20">Jleli et al. (2024)</xref> achieved 87% accuracy with YOLO V5 and ShuffleNet V2 on KIMORE, while <xref ref-type="bibr" rid="B49">Zaher et al. (2024)</xref> improved it to 93.08% with CNN optimization. <xref ref-type="bibr" rid="B50">Zaher et al. (2025)</xref> reported 81.85% using a hybrid FCBF-Extra Trees model. For UTKinect-Action3D, <xref ref-type="bibr" rid="B23">Ke&#xe7;eli et al. (2022)</xref>, <xref ref-type="bibr" rid="B12">Ding et al. (2018)</xref>, and <xref ref-type="bibr" rid="B25">Kumar et al. (2024)</xref> achieved 93.4%, 91.5%, and 93.5% accuracy, respectively. <xref ref-type="bibr" rid="B4">An et al. (2022)</xref> achieved mAP scores of 91.56% and 95.07% with ActionFormer on mRI. The proposed model outperforms previous work with 94.73% (KIMORE), 91% (mRI), and 94.2% (UTKinect-Action3D), demonstrating superior generalizability.</p>
<table-wrap id="T10" position="float">
<label>TABLE 10</label>
<caption>
<p>Comparison of methodologies, datasets, and results from recent studies on physical rehabilitation exercise recognition and assessment.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Author</th>
<th align="center">Title</th>
<th align="center">Methodology</th>
<th align="center">Dataset</th>
<th align="center">Results</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">
<xref ref-type="bibr" rid="B20">Jleli et al. (2024)</xref>
</td>
<td align="center">Artificial Intelligence-driven Remote Monitoring Model for Physical Rehabilitation</td>
<td align="center">YOLO V5&#x2013;ShuffleNet V2</td>
<td align="center">KIMORE</td>
<td align="center">Accuracy &#x3d; 87.00%</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B49">Zaher et al. (2024)</xref>
</td>
<td align="center">Unlocking the potential of RNN and CNN models for accurate rehabilitation exercise classification on multi-datasets</td>
<td align="center">CNN with hyperparameter tuning</td>
<td align="center">KIMORE</td>
<td align="center">Accuracy &#x3d; 93.08%</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B50">Zaher et al. (2025)</xref>
</td>
<td align="center">Rehabilitation monitoring and assessment: a comparative analysis of feature engineering and machine learning algorithms on the UI-PRMD and KIMORE benchmark datasets</td>
<td align="center">The combination of FCBF for feature ranking and Extra Trees classifier</td>
<td align="center">KIMORE</td>
<td align="center">Accuracy &#x3d; 81.85%</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B23">Ke&#xe7;eli et al. (2022)</xref>
</td>
<td align="center">3D Skeletal Volume Templates for Deep Learning-Based Activity Recognition</td>
<td align="center">HOG &#x2b; Deep Features</td>
<td align="center">UTKinect-Action3D Dataset</td>
<td align="center">Accuracy &#x3d; 93.40%</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B12">Ding et al. (2018)</xref>
</td>
<td align="center">Human Action Recognition Using Similarity Degree Between Postures and Spectral Learning</td>
<td align="center">Rotation Matrix Representation-Based 3D (RMRB3D) with Singular Value Decomposition (SVD) and Hidden Markov Model (HMM)</td>
<td align="center">UTKinect-Action3D Dataset</td>
<td align="center">Accuracy &#x3d; 91.50%</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B25">Kumar et al. (2024)</xref>
</td>
<td align="center">Human Action Recognition from Depth Sensor via Skeletal Joint and Shape Trajectories with a Time-Series Graph Matching</td>
<td align="center">Time-Series Graph Matching (TSGM)</td>
<td align="center">UTKinect-Action3D Dataset</td>
<td align="center">Accuracy &#x3d; 93.50%</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B4">An et al. (2022)</xref>
</td>
<td align="center">Multi-modal 3D Human Pose Estimation using mmWave, RGB-D, and Inertial Sensors</td>
<td align="center">ActionFormer</td>
<td align="center">mRI: Multi-modal 3D Human Pose Estimation Dataset using mmWave, RGB-D, and Inertial Sensors</td>
<td align="center">Protocol 1 (Random split) mAP &#x3d; 91.56<break/>Protocol 2 (Subject-wise split) mAP &#x3d; 95.07</td>
</tr>
<tr>
<td rowspan="3" colspan="2" align="center">Proposed</td>
<td align="center">KIMORE</td>
<td colspan="2" align="center">
<bold>94.73%</bold>
</td>
</tr>
<tr>
<td align="center">mRI</td>
<td colspan="2" align="center">
<bold>91.00%</bold>
</td>
</tr>
<tr>
<td align="center">UTKinect-Action3D</td>
<td colspan="2" align="center">
<bold>94.20%</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-6">
<label>4.6</label>
<title>Ablation study</title>
<p>An ablation study was performed to evaluate the contribution of each feature stream as well as the impact of feature dimensionality on model performance (<xref ref-type="table" rid="T11">Table 11</xref>). The full model, which integrates preprocessing, 3D Mesh features, 2D keypoints, and BPL-based contour points, achieved the highest accuracy across all datasets. For the 3D Mesh features, reducing the number of SMPL vertices by 25% and 50% produced only moderate accuracy declines compared to the full-resolution mesh, while complete removal caused the largest performance drop (KIMORE: 94.73% at full resolution, 91.79% and 90.52% with 25% and 50% fewer vertices, respectively, and 87.00% without the mesh). The smooth decline across these conditions indicates that the model does not simply memorize high-dimensional details but continues to generalize well even with fewer vertices. This suggests a low risk of overfitting to mesh complexity, while still confirming the strong importance of biomechanical information. For the 2D keypoints, models trained with individual detectors (AKAZE, SIFT, BRISK, ORB, Shi-Tomasi) achieved stable accuracy in the 90%&#x2013;93% range, while the fused vector consistently outperformed single detectors. This consistency across different detectors demonstrates that the model is not overfitting to the idiosyncrasies of any one keypoint representation. Instead, it learns complementary information from multiple detectors, thereby improving generalization and robustness.</p>
<table-wrap id="T11" position="float">
<label>TABLE 11</label>
<caption>
<p>Ablation study on model configurations and their impact on exercise recognition accuracy across KIMORE, UTKinect-Action3D and mRI datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Model configuration</th>
<th align="center">Description</th>
<th align="center">KIMORE accuracy (%)</th>
<th align="center">mRI accuracy (%)</th>
<th align="center">UTKinect-Action3D accuracy (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">All Parameters (Preprocessing, 3D Mesh, 2D Keypoints, BPL-based Contour Points)</td>
<td align="center">Model trained using all feature extraction techniques</td>
<td align="center">94.73%</td>
<td align="center">91.00%</td>
<td align="center">94.20%</td>
</tr>
<tr>
<td align="center">Without Preprocessing</td>
<td align="center">Model trained without image preprocessing</td>
<td align="center">90.50%</td>
<td align="center">87.30%</td>
<td align="center">88.80%</td>
</tr>
<tr>
<td align="center">Without 25% 3D Mesh Vertices</td>
<td align="center">Model trained with 25% reduced 3D Mesh Vertices</td>
<td align="center">91.79%</td>
<td align="center">87.50%</td>
<td align="center">88.80%</td>
</tr>
<tr>
<td align="center">Without 50% 3D Mesh Vertices</td>
<td align="center">Model trained with 50% reduced 3D Mesh Vertices</td>
<td align="center">90.52%</td>
<td align="center">87.10%</td>
<td align="center">86.20%</td>
</tr>
<tr>
<td align="center">Without 3D Mesh</td>
<td align="center">Model trained without 3D mesh features</td>
<td align="center">87.00%</td>
<td align="center">85.00%</td>
<td align="center">82.00%</td>
</tr>
<tr>
<td align="center">Without 2D Keypoint (AKAZE)</td>
<td align="center">Model trained with AKAZE keypoints</td>
<td align="center">91.20%</td>
<td align="center">89.40%</td>
<td align="center">92.25%</td>
</tr>
<tr>
<td align="center">Without 2D Keypoint (SIFT)</td>
<td align="center">Model trained with SIFT keypoints</td>
<td align="center">90.25%</td>
<td align="center">88.20%</td>
<td align="center">90.20%</td>
</tr>
<tr>
<td align="center">Without 2D Keypoint (BRISK)</td>
<td align="center">Model trained with BRISK keypoints</td>
<td align="center">91.50%</td>
<td align="center">89.10%</td>
<td align="center">91.90%</td>
</tr>
<tr>
<td align="center">Without 2D Keypoint (ORB)</td>
<td align="center">Model trained with ORB keypoints</td>
<td align="center">90.75%</td>
<td align="center">89.40%</td>
<td align="center">91.80%</td>
</tr>
<tr>
<td align="center">Without 2D Keypoint (Shi Tomasi)</td>
<td align="center">Model trained with Shi Tomasi keypoints</td>
<td align="center">92.30%</td>
<td align="center">89.70%</td>
<td align="center">93.20%</td>
</tr>
<tr>
<td align="center">Without Complete 2D Keypoints Vector</td>
<td align="center">Model trained without all 2D keypoint features</td>
<td align="center">91.00%</td>
<td align="center">89.00%</td>
<td align="center">86.00%</td>
</tr>
<tr>
<td align="center">Without 50% BPL-based Contour Points</td>
<td align="center">Model trained without 50% BPL-based contour points (Random Selection with Uniform number of keypoints per body part)</td>
<td align="center">92.70%</td>
<td align="center">90.40%</td>
<td align="center">91.20%</td>
</tr>
<tr>
<td align="center">Without BPL-based Contour Points</td>
<td align="center">Model trained without BPL-based contour points</td>
<td align="center">91.60%</td>
<td align="center">89.40%</td>
<td align="center">88.90%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>For the BPL-based contour features, randomly pruning 50% of contour points while maintaining uniform distribution across body parts led to only a minor accuracy drop (KIMORE: from 94.73% to 92.70%), while complete removal produced a slightly larger decline. This indicates that the framework remains reliable even when partial information is missing, showing resilience to noise and occlusion. Overall, the ablation results confirm that the model maintains strong performance under reduced feature dimensionality and noisy conditions, demonstrating both robustness and resistance to overfitting.</p>
</sec>
<sec id="s4-7">
<label>4.7</label>
<title>Computational cost analysis</title>
<p>We evaluated the computational cost of all major components of the proposed architecture. As shown in <xref ref-type="table" rid="T12">Table 12</xref>, the pipeline exhibits a clear distinction between lightweight classical techniques and computationally intensive deep learning models. Classical keypoint detection methods, including corner and feature detection, are highly efficient and contribute minimally to overall computational cost. In contrast, stages such as semantic segmentation, body part parsing, 3D keypoint extraction, and pose fitting dominate processing, forming the primary bottlenecks in the system. Feature fusion operations are lightweight, while transformer-based inference introduces moderate computational overhead. Overall, the pipeline relies on GPU acceleration for near real-time performance, with optimization of segmentation and pose-fitting stages offering the greatest potential for improving throughput.</p>
<table-wrap id="T12" position="float">
<label>TABLE 12</label>
<caption>
<p>FLOPs and estimated time per frame for the 3D-PoseFormer pipeline.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Stage</th>
<th align="center">Technique</th>
<th align="center">Estimated time per frame (ms)</th>
<th align="center">FLOPs (GFLOPs)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">RGB-KPD</td>
<td align="center">Shi-Tomasi Corner Detection</td>
<td align="center">1.2</td>
<td align="center">0.02</td>
</tr>
<tr>
<td align="left"/>
<td align="center">AKAZE Feature Detection</td>
<td align="center">1.3</td>
<td align="center">0.03</td>
</tr>
<tr>
<td align="left"/>
<td align="center">BRISK Feature Detection</td>
<td align="center">1.3</td>
<td align="center">0.03</td>
</tr>
<tr>
<td align="left"/>
<td align="center">SIFT Feature Detection</td>
<td align="center">1.8</td>
<td align="center">0.04</td>
</tr>
<tr>
<td align="left"/>
<td align="center">Harris Corner Detection</td>
<td align="center">1.2</td>
<td align="center">0.02</td>
</tr>
<tr>
<td align="left"/>
<td align="center">DeepLabV3&#x2b; with ResNet-101 (Segmentation)</td>
<td align="center">12.0</td>
<td align="center">15.6</td>
</tr>
<tr>
<td align="left"/>
<td align="center">Body Part Labeling (Single-Human-Parsing-LIP)</td>
<td align="center">2.5</td>
<td align="center">0.8</td>
</tr>
<tr>
<td align="left"/>
<td align="center">Contour-Based Keypoint Extraction</td>
<td align="center">0.6</td>
<td align="center">0.01</td>
</tr>
<tr>
<td align="center">D-Mesh</td>
<td align="center">DKP-Net-24 (3D Keypoint Extraction, L and R pipelines)</td>
<td align="center">12.0</td>
<td align="center">0.5</td>
</tr>
<tr>
<td align="left"/>
<td align="center">SMPL Fitting (Pose/Shape Optimization)</td>
<td align="center">18.0</td>
<td align="center">2.3</td>
</tr>
<tr>
<td align="center">Feature Fusion</td>
<td align="center">Concatenation of RGB and Depth Features</td>
<td align="center">2.5</td>
<td align="center">0.01</td>
</tr>
<tr>
<td align="center">Transformer Inference</td>
<td align="center">Transformer Encoder (4 layers, 8 heads, 512 dims)</td>
<td align="center">10.0</td>
<td align="center">1.8</td>
</tr>
<tr>
<td align="center">Total</td>
<td align="left"/>
<td align="center">&#x223c;55</td>
<td align="center">20.31</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>In this work, we proposed a novel multimodal deep learning pipeline for automated recognition and assessment of physiotherapy exercises, specifically designed for remote rehabilitation of physically disabled individuals. Unlike existing systems that rely on wearable sensors, markers, or controlled clinical environments, our framework leverages only RGB and depth data to deliver accurate, real-time performance evaluation in unconstrained, home-based settings. The key novelty of the proposed approach lies in its comprehensive fusion of depth-based 3D body mesh representations generated using SMPL and appearance-based features extracted from RGB images using both classical keypoint detectors and semantic contour analysis on segmented body parts. This multi-level feature representation is further enhanced by a Transformer-based temporal modeling module, enabling robust classification and fine-grained assessment of exercise execution quality. Our system outperforms prior methods on benchmark datasets, achieving 94.73% accuracy on KIMORE, 91% on mRI, and 94.2% on UTKinect-Action3D, demonstrating its effectiveness, generalizability, and real-world applicability. The proposed pipeline represents a significant advancement toward intelligent, scalable, and sensor-free telerehabilitation solutions.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. These data can be found here: KIMORE dataset: <ext-link ext-link-type="uri" xlink:href="https://vrai.dii.univpm.it/content/kimore-dataset">https://vrai.dii.univpm.it/content/kimore-dataset</ext-link>; UTKinect-Action3D Dataset: <ext-link ext-link-type="uri" xlink:href="https://cvrc.ece.utexas.edu/KinectDatasets/HOJ3D.html">https://cvrc.ece.utexas.edu/KinectDatasets/HOJ3D.html</ext-link>; mRI: Multi-modal 3D Human Pose Estimation Dataset using mmWave, RGB-D, and Inertial Sensors: <ext-link ext-link-type="uri" xlink:href="https://sizhean.github.io/mri">https://sizhean.github.io/mri</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>MuA: Methodology, Writing &#x2013; original draft. YW: Data curation, Formal Analysis, Writing &#x2013; review and editing. SN: Conceptualization, Resources, Writing &#x2013; review and editing. MoA: Visualization, Writing &#x2013; review and editing. YA: Resources, Visualization, Writing &#x2013; review and editing. HA: Investigation, Software, Writing &#x2013; review and editing. AJ: Supervision, Writing &#x2013; original draft. HL: Conceptualization, Validation, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>Authors MuA and HL were employed by Guodian Nanjing Automation Co., Ltd. Author YW is employed by Guodian Nanjing Automation Co., Ltd.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1516454/overview">Zixiang Gao</ext-link>, University of Calgary, Canada</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1744002/overview">Shao Enze</ext-link>, Ningbo University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1887519/overview">Hongjun Yang</ext-link>, Chinese Academy of Sciences (CAS), China</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B65">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Afsar</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Saqib</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Aladfaj</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alatiyyah</surname>
<given-names>M. H.</given-names>
</name>
<name>
<surname>Alnowaiser</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Aljuaid</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Body-worn sensors for recognizing physical sports activities in exergaming via deep learning model</article-title>. <source>IEEE Access</source> <volume>2023</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3239692</pub-id>
</mixed-citation>
</ref>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akhter</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Javeed</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Deep skeleton modeling and hybrid hand-crafted cues over physical exercises</article-title>. <source>Proc. Int. Conf. Commun. Comput. Digital Syst. (C-CODE)</source> <volume>2023</volume>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/C-CODE58145.2023.10139863</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alcantarilla</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Nuevo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bartoli</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Fast explicit diffusion for accelerated features in nonlinear scale spaces</article-title>. <source>Proc. Br. Mach. Vis. Conf. (BMVC)</source> <volume>2011</volume>, <fpage>13.1</fpage>&#x2013;<lpage>13.11</lpage>. <pub-id pub-id-type="doi">10.5244/C.27.13</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>An</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ogras</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2022</year>). <source>mRI: multi-Modal 3D human pose estimation dataset using mmWave, RGB-D, and inertial sensors</source>. <comment>arXiv, 2022; arXiv:2210.08394</comment>.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>She</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Muscle synergy analysis for gesture recognition based on sEMG images and shapley value</article-title>. <source>Intell. Robot.</source> <volume>3</volume>, <fpage>495</fpage>&#x2013;<lpage>513</lpage>. <pub-id pub-id-type="doi">10.20517/ir.2023.28</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ashraf</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Najam</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sadiq</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Algamdi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Aljuaid</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Rahman</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>A novel telerehabilitation system for physical exercise monitoring in elderly healthcare</article-title>. <source>IEEE Access</source> <volume>2025</volume>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3526710</pub-id>
</mixed-citation>
</ref>
<ref id="B61">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Aubry</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Laraba</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tilmanne</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dutoit</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Action recognition based on 2D skeletons extracted from RGB videos</article-title>,&#x201d; in <source>Proc. MATEC Web Conf</source>. <volume>277</volume>, <fpage>02034</fpage>. <pub-id pub-id-type="doi">10.1051/matecconf/201927702034</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Awan</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Najam</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Robust exercise-based telerehabilitation for elderly healthcare services</article-title>. <source>Proc. Int. Conf. Emerg. Trends Electr. Control, Telecommun. Eng.</source> <volume>2024</volume>, <fpage>33</fpage>&#x2013;<lpage>39</lpage>. <pub-id pub-id-type="doi">10.1109/ICET63392.2024.10935118</pub-id>
</mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Barabas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bednar</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Vychlopen</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Kinect-based platform for movement monitoring and fall-detection of elderly people</article-title>,&#x201d; in <conf-name>Proc. 12th Int.</conf-name>, <conf-loc>Smolenice, Slovakia</conf-loc>, <fpage>199</fpage>&#x2013;<lpage>202</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://ieeexplore.ieee.org/document/8780004/">https://ieeexplore.ieee.org/document/8780004/</ext-link>
</comment> (<comment>Accessed October 21, 2025</comment>).</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Simon</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>S.-E.</given-names>
</name>
<name>
<surname>Sheikh</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Realtime multi-person 2D pose estimation using part affinity fields</article-title>. <source>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</source> <fpage>7291</fpage>&#x2013;<lpage>7299</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2017.143</pub-id>
</mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Capecci</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ceravolo</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Ferracuti</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Iarlori</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Monteriu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Romeo</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>The kimore dataset: kinematic assessment of movement and clinical scores for remote monitoring of physical rehabilitation</article-title>. <source>IEEE Trans. Neural Syst. Rehabil. Eng.</source> <volume>27</volume> (<issue>7</issue>), <fpage>1436</fpage>&#x2013;<lpage>1448</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2019.2923060</pub-id>
<pub-id pub-id-type="pmid">31217121</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chang</surname>
<given-names>Y.-J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.-F.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.-D.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>A kinect-based system for physical rehabilitation: a pilot study for young adults with motor disabilities</article-title>. <source>Res. Dev. Disabil.</source> <volume>32</volume> (<issue>6</issue>), <fpage>2566</fpage>&#x2013;<lpage>2570</lpage>. <pub-id pub-id-type="doi">10.1016/j.ridd.2011.07.002</pub-id>
<pub-id pub-id-type="pmid">21784612</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Human action recognition using similarity degree between postures and spectral learning</article-title>. <source>IET Comput. Vis.</source> <volume>12</volume> (<issue>1</issue>), <fpage>110</fpage>&#x2013;<lpage>117</lpage>. <pub-id pub-id-type="doi">10.1049/iet-cvi.2017.0031</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Douglas</surname>
<given-names>D. H.</given-names>
</name>
<name>
<surname>Peucker</surname>
<given-names>T. K.</given-names>
</name>
</person-group> (<year>1973</year>). <article-title>Algorithms for the reduction of the number of points required to represent a digitized line or its caricature</article-title>. <source>Can. Cartogr.</source> <volume>10</volume>, <fpage>112</fpage>&#x2013;<lpage>122</lpage>. <pub-id pub-id-type="doi">10.3138/fm57-6770-u75u-7727</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Fatima</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>A novel full-body and geometric features for physical sports interaction recognition</article-title>,&#x201d; in <source>Proc. International conference on innovative computing</source>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</mixed-citation>
</ref>
<ref id="B64">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gumaei</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hassan</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Alelaiwi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Alsalman</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A hybrid deep learning model for human activity recognition using multimodal body sensing data</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>99152</fpage>&#x2013;<lpage>99160</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2927134</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Saini</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Aggarwal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Human activity recognition using RGB video: a survey</article-title>,&#x201d; in <source>Proc. Int. Conf. Soft Computing and Pattern Recognition (SoCPaR)</source>, <fpage>43</fpage>&#x2013;<lpage>52</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hamamoto</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hideshima</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Serikawa</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>DeepLabv3</article-title>,&#x201d; in <conf-name>Artificial Intelligence and Robotics: 8th International Symposium, ISAIR 2023</conf-name>, <conf-loc>Beijing, China</conf-loc>, <conf-date>Oct. 21&#x2013;23, 2023</conf-date> (<publisher-name>Springer Nature</publisher-name>), <fpage>181</fpage>.</mixed-citation>
</ref>
<ref id="B59">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Hamdy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Taie</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zaher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Al-Emrany</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Mahmoud Ahmed</surname>
<given-names>O. S.</given-names>
</name>
<name>
<surname>Atia</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Enhancing physical therapy through transformer-based models: a study on exercise classification</article-title>,&#x201d; in <source>2024 Intelligent Methods, Systems, and Applications (IMSA)</source>. (<publisher-name>IEEE</publisher-name>), <fpage>366</fpage>&#x2013;<lpage>371</lpage>. <pub-id pub-id-type="doi">10.1109/IMSA61967.2024.10652817</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Harris</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Stephens</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>A combined corner and edge detector</article-title>. <source>Proc. Alvey Vis. Conf.</source> <volume>1988</volume>, <fpage>147</fpage>&#x2013;<lpage>152</lpage>. <pub-id pub-id-type="doi">10.5244/C.2.23</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <source>Single-Human-Parsing-LIP: body part labeling for human parsing</source>. <publisher-loc>San Francisco, CA, United States</publisher-loc>: <publisher-name>GitHub repository</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://github.com/Yukun-Huang/Single-Human-Parsing-LIP">https://github.com/Yukun-Huang/Single-Human-Parsing-LIP</ext-link>.</comment>
</mixed-citation>
</ref>
<ref id="B63">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Javeed</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chelloug</surname>
<given-names>S. A.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Automated gestures recognition in exergaming</article-title>,&#x201d; in <source>Proc. 2022 Int. Conf. Electrical Engineering and Sustainable Technologies (ICEEST)</source> (<publisher-loc>Lahore, Pakistan</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/ICEEST56292.2022.10077853</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jleli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Samet</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dutta</surname>
<given-names>A. K.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Artificial intelligence-driven remote monitoring model for physical rehabilitation</article-title>. <source>J. Disabil. Res.</source> <volume>3</volume> (<issue>1</issue>). <pub-id pub-id-type="doi">10.57197/jdr-2023-0065</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kamal</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Alshahrani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Najam</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Alshehri</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alqahtani</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Alabdullah</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Holistic pose estimation and dynamic motion analysis for telerehabilitation of physically disabled individuals</article-title>. <source>IEEE Access</source> <volume>13</volume>, <fpage>81279</fpage>&#x2013;<lpage>81297</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3565024</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kanazawa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Black</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Jacobs</surname>
<given-names>D. W.</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>End-to-End recovery of human shape and pose</article-title>. <source>Proc. CVPR</source> <volume>2018</volume>, <fpage>7122</fpage>&#x2013;<lpage>7131</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2018.00744</pub-id>
</mixed-citation>
</ref>
<ref id="B66">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kaynat</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ahmed Rafique</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Wearable sensors for exergaming physical exercise monitoring via dynamic features</article-title>. <source>ComTech</source> <volume>2025</volume>, <fpage>2</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1109/ComTech65062.2025.11034496</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ke&#xe7;eli</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Kaya</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Can</surname>
<given-names>A. B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>3D skeletal volume templates for deep learning-based activity recognition</article-title>. <source>Electronics</source> <volume>11</volume> (<issue>21</issue>), <fpage>3567</fpage>. <pub-id pub-id-type="doi">10.3390/electronics11213567</pub-id>
</mixed-citation>
</ref>
<ref id="B67">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ahmed Rafique</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Health gaming based activity recognition using body-worn sensors via artificial neural network</article-title>. <source>ComTech</source> <volume>2025</volume>, <fpage>2</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/ComTech65062.2025.11034569</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kocabas</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Athanasiou</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Black</surname>
<given-names>M. J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>VIBE: video inference for human body pose and shape estimation</article-title>. <source>Proc. CVPR</source> <volume>2020</volume>, <fpage>5253</fpage>&#x2013;<lpage>5263</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1912.05656</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>E. K.</given-names>
</name>
<name>
<surname>Suneetha</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rajasekhar</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Human action recognition from depth sensor via skeletal joint and shape trajectories with a time-series graph matching</article-title>,&#x201d; in <conf-name>Proc. Int. Conf. Signal Process. Commun. Eng. Syst. (SPACES)</conf-name>, <conf-loc>Andhra Pradesh, India</conf-loc>, <conf-date>June 11&#x2013;12, 2021</conf-date>. <pub-id pub-id-type="doi">10.1063/5.0111612</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Leutenegger</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Siegwart</surname>
<given-names>R. Y.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>BRISK: binary robust invariant scalable keypoints</article-title>,&#x201d; in <conf-name>Proc. IEEE Int. Conf. Computer Vision (ICCV)</conf-name>, <conf-loc>Barcelona, Spain</conf-loc>, <conf-date>Nov. 6&#x2013;13, 2011</conf-date>, <fpage>2548</fpage>&#x2013;<lpage>2555</lpage>. <pub-id pub-id-type="doi">10.1109/iccv.2011.6126542</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Action recognition from RGB video using pose estimation and graph CNNs</article-title>,&#x201d; in <source>Proc. IEEE Int. Conf. Image Processing (ICIP)</source>, <fpage>2354</fpage>&#x2013;<lpage>2358</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Video swin transformer</article-title>. <source>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</source> <volume>2022</volume>, <fpage>3202</fpage>&#x2013;<lpage>3211</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2106.13230</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Loper</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mahmood</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Romero</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pons-Moll</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Black</surname>
<given-names>M. J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>SMPL: a skinned multi-person linear model</article-title>. <source>ACM Trans. Graph.</source> <volume>34</volume> (<issue>6</issue>), <fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1145/2816795.2818013</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lowe</surname>
<given-names>D. G.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Distinctive image features from scale-invariant keypoints</article-title>. <source>Int. J. Comput. Vis.</source> <volume>60</volume>, <fpage>91</fpage>&#x2013;<lpage>110</lpage>. <pub-id pub-id-type="doi">10.1023/b:visi.0000029664.99615.94</pub-id>
</mixed-citation>
</ref>
<ref id="B68">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nadeem</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Accurate physical activity recognition using multidimensional features and markov model for smart health fitness</article-title>. <source>Symmetry</source> <volume>12</volume> (<issue>11</issue>), <fpage>1766</fpage>. <pub-id pub-id-type="doi">10.3390/sym12111766</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Nazar</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>Wearable sensors-based activity classification for intelligent healthcare monitoring</article-title>,&#x201d; in <source>Proc. International conference on advancements in computational sciences (ICACS)</source>, <fpage>6</fpage>&#x2013;<lpage>12</lpage>.</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Panigrahi</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Sahoo</surname>
<given-names>P. K.</given-names>
</name>
<name>
<surname>Panda</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Panda</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A ResNet-101 deep learning framework induced transfer learning strategy for moving object detection</article-title>. <source>Image Vis. Comput.</source> <volume>146</volume>, <fpage>105021</fpage>. <pub-id pub-id-type="doi">10.1016/j.imavis.2024.105021</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pavlakos</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Derpanis</surname>
<given-names>K. G.</given-names>
</name>
<name>
<surname>Daniilidis</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Coarse-to-Fine volumetric prediction for single-image 3D human pose</article-title>. <source>Proc. CVPR</source> <volume>2017</volume>, <fpage>7025</fpage>&#x2013;<lpage>7034</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1611.07828</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rublee</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Rabaud</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Konolige</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bradski</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>ORB: an efficient alternative to SIFT or SURF</article-title>. <source>Proc. IEEE Int. Conf. Comput. Vis. (ICCV)</source> <volume>2011</volume>, <fpage>2564</fpage>&#x2013;<lpage>2571</lpage>. <pub-id pub-id-type="doi">10.1109/iccv.2011.6126544</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>&#x015E;ahin</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Evaluation of wearable technologies in physiotherapy and rehabilitation</article-title>. <source>BAU Health Innov.</source> <fpage>91</fpage>&#x2013;<lpage>100</lpage>. <pub-id pub-id-type="doi">10.14744/bauh.2023.43531</pub-id>
</mixed-citation>
</ref>
<ref id="B62">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saqlain</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Cha</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Baek</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>3DMesh-GAR: 3D human body mesh-based method for group activity recognition</article-title>. <source>Sensors</source> <volume>22</volume> (<issue>4</issue>), <fpage>1464</fpage>. <pub-id pub-id-type="doi">10.3390/s22041464</pub-id>
<pub-id pub-id-type="pmid">35214365</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tomasi</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>1994</year>). &#x201c;<article-title>Good features to track</article-title>,&#x201d; in <source>Proc. IEEE conf. Computer vision and pattern recognition (CVPR)</source>, <fpage>593</fpage>&#x2013;<lpage>600</lpage>.</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tayyab</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>Disabled rehabilitation monitoring and patients healthcare recognition using machine learning</article-title>,&#x201d; in <source>Proc. International conference on advancements in computational sciences (ICACS)</source>, <fpage>9</fpage>&#x2013;<lpage>15</lpage>.</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tayyab</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alateyah</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Alnusayri</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alatiyyah</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>AlHammadi</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>A hybrid approach for sports activity recognition using key body descriptors and hybrid deep learning classifier</article-title>. <source>Sensors</source> <volume>25</volume> (<issue>2</issue>), <fpage>441</fpage>. <pub-id pub-id-type="doi">10.3390/s25020441</pub-id>
<pub-id pub-id-type="pmid">39860811</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shazeer</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Parmar</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Uszkoreit</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Gomez</surname>
<given-names>A. N.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>Proc. Adv. Neural Inf. Process. Syst.</source> <volume>2017</volume>, <fpage>5998</fpage>&#x2013;<lpage>6008</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Klaser</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schmid</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C. L.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Action recognition by dense trajectories</article-title>. <source>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</source> <volume>2011</volume>, <fpage>3169</fpage>&#x2013;<lpage>3176</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2011.5995407</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>EMG-based estimation of limb movement using deep learning with recurrent convolutional neural networks</article-title>. <source>Artif. Organs</source> <volume>42</volume> (<issue>5</issue>), <fpage>E67</fpage>&#x2013;<lpage>E77</lpage>. <pub-id pub-id-type="doi">10.1111/aor.13004</pub-id>
<pub-id pub-id-type="pmid">29068076</pub-id>
</mixed-citation>
</ref>
<ref id="B60">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Scene image and human skeleton-based dual-stream human action recognition</article-title>. <source>Pattern Recognit. Lett.</source> <volume>148</volume>, <fpage>136</fpage>&#x2013;<lpage>145</lpage>. <pub-id pub-id-type="doi">10.1016/j.patrec.2021.06.003</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Recognizing actions using depth motion maps-based histograms of oriented gradients</article-title>. <source>ACM Int. Conf. Multimed.</source> <volume>2012</volume>, <fpage>1057</fpage>&#x2013;<lpage>1060</lpage>. <pub-id pub-id-type="doi">10.1145/2393347.2396382</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zaher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ghoneim</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Abdelhamid</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Atia</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Unlocking the potential of RNN and CNN models for accurate rehabilitation exercise classification on multi-datasets</article-title>. <source>Multimed. Tools Appl.</source> <volume>84</volume> (<issue>3</issue>), <fpage>1261</fpage>&#x2013;<lpage>1301</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-024-19092-0</pub-id>
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zaher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ghoneim</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Abdelhamid</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Atia</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Rehabilitation monitoring and assessment: a comparative analysis of feature engineering and machine learning algorithms on the UI-PRMD and KIMORE benchmark datasets</article-title>. <source>J. Inf. Telecommun.</source> <volume>2025</volume>, <fpage>382</fpage>&#x2013;<lpage>402</lpage>. <pub-id pub-id-type="doi">10.1080/24751839.2025.2454053</pub-id>
</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zanfir</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Marinoiu</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Sminchisescu</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Monocular 3D pose and shape estimation of multiple people in natural scenes&#x2014;the importance of multiple scene constraints</article-title>. <source>Proc. CVPR</source> <volume>2018</volume>, <fpage>2148</fpage>&#x2013;<lpage>2157</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2018.00229</pub-id>
</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>T. Y.</given-names>
</name>
<name>
<surname>Suen</surname>
<given-names>C. Y.</given-names>
</name>
</person-group> (<year>1984</year>). <article-title>A fast parallel algorithm for thinning digital patterns</article-title>. <source>Commun. ACM</source> <volume>27</volume>, <fpage>236</fpage>&#x2013;<lpage>239</lpage>. <pub-id pub-id-type="doi">10.1145/357994.358023</pub-id>
</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Physical activity recognition for elderly using hybrid deep models with single IMU</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>100104</fpage>&#x2013;<lpage>100113</lpage>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>